Main Function
The main function, readRunCOVIDTrackingProject(), is copied below:
# Function to download/load, process, segment, and analyze data from COVID Tracking Project
readRunCOVIDTrackingProject <- function(thruLabel,
                                        downloadTo=NULL,
                                        readFrom=downloadTo,
                                        compareFile=NULL,
                                        dfPerCapita=NULL,
                                        useClusters=NULL,
                                        hierarchical=TRUE,
                                        returnList=!hierarchical,
                                        kCut=6,
                                        reAssignState=vector("list", 0),
                                        makeCumulativePlots=TRUE,
                                        skipAssessmentPlots=FALSE,
                                        ...
                                        ) {
  # FUNCTION ARGUMENTS:
  # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
  # downloadTo: download the most recent COVID Tracking Project data to this location
  #   NULL means do not download any data
  # readFrom: location for reading in the COVID Tracking Project data (defaults to downloadTo)
  # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
  # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
  # useClusters: file containing clusters by state (NULL means make the clusters from the data)
  # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
  # returnList: boolean, should a list be returned or just the cluster object?
  #   refers to what is returned by clusterStates(); the main function always returns a list
  # kCut: number of segments when cutting the hierarchical tree
  # reAssignState: mapping file for assigning a state to another state's cluster
  #   format list("stateToChange"="stateClusterToAssign")
  # makeCumulativePlots: whether to make plots of cumulative metrics
  # skipAssessmentPlots: boolean to skip the plots for assessClusters()
  #   especially useful if just exploring dendrograms or silhouette widths
  # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
  # STEP 1: Get state data
  stateData <- getStateData()
  # STEPS 2-4 are run only if dfPerCapita does not exist
  if (is.null(dfPerCapita)) {
    # STEP 2a: Download latest COVID Tracking Project data (if requested)
    if (!is.null(downloadTo)) downloadCOVIDbyState(fileName=downloadTo)
    # STEP 2b: Read-in COVID Tracking Project data
    dfRaw <- readCOViDbyState(readFrom, checkFile=compareFile)
    glimpse(dfRaw)
    # STEP 3: Process the data so that it includes all requested key variables
    varsFilter <- c("date", "state", "positiveIncrease", "deathIncrease",
                    "hospitalizedCurrently", "totalTestResultsIncrease"
                    )
    dfFiltered <- processCVData(dfRaw,
                                varsKeep=varsFilter,
                                varsRename=c(positiveIncrease="cases",
                                             deathIncrease="deaths",
                                             hospitalizedCurrently="hosp",
                                             totalTestResultsIncrease="tests"
                                             )
                                )
    glimpse(dfFiltered)
    # STEP 4: Convert to per capita
    dfPerCapita <- helperMakePerCapita(dfFiltered,
                                       mapVars=c("cases"="cpm", "deaths"="dpm",
                                                 "hosp"="hpm", "tests"="tpm"
                                                 ),
                                       popData=stateData
                                       )
    glimpse(dfPerCapita)
  } else {
    # dfPerCapita was supplied directly, so the raw/filtered stages are unavailable
    dfRaw <- NULL
    dfFiltered <- NULL
  }
  # STEP 5: Create the clusters (if they have not been passed)
  if (is.null(useClusters)) {
    # Run the clustering process
    clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
    # If hierarchical clusters, cut the tree, otherwise use the output object directly
    if (hierarchical) {
      useClusters <- cutree(clData, k=kCut)
    } else {
      useClusters <- clData$objCluster$cluster
    }
    # If requested, manually assign clusters to the cluster for another state
    # (seq_along() is safe for an empty reAssignState list, running zero iterations)
    for (xNum in seq_along(reAssignState)) {
      useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
    }
  }
  # STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
  if (skipAssessmentPlots) {
    return(list(stateData=stateData,
                dfRaw=dfRaw,
                dfFiltered=dfFiltered,
                dfPerCapita=dfPerCapita,
                useClusters=useClusters,
                plotData=NULL,
                consolidatedPlotData=NULL,
                clCum=NULL
                )
           )
  }
  # STEP 6: Create the cluster assessments
  plotData <- assessClusters(useClusters,
                             dfState=stateData,
                             dfBurden=dfPerCapita,
                             thruLabel=thruLabel,
                             plotsTogether=TRUE
                             )
  # STEP 7: Plot the consolidated metrics
  subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
  consolidatedPlotData <- plotConsolidatedMetrics(plotData,
                                                  varMain=c("state", "cluster", "date", "pop",
                                                            "cases", "deaths", "hosp", "tests"
                                                            ),
                                                  subT=subT,
                                                  nrowPlot2=2
                                                  )
  # STEP 8: Create cumulative metrics if requested
  if (makeCumulativePlots) {
    # Derive percent-positive (cases/tests) and re-lengthen so it plots alongside the metrics
    # NOTE(review): pivot_wider(-vpm7, ...) passes -vpm7 as id_cols, which also selects the
    # names_from column "name"; tidyr tolerates this here, but confirm on tidyr upgrades
    consPos <- consolidatedPlotData %>%
      ungroup() %>%
      select(state, cluster, date, name, vpm7) %>%
      arrange(state, cluster, date, name) %>%
      pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
      mutate(pctpos=cases/tests) %>%
      pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
      filter(!is.na(vpm7))
    clCum <- makeCumulative(consPos)
    # Overview plot (p1) only, then one keyed plot per metric with flagged states
    plotCumulativeData(clCum,
                       keyMetricp2="",
                       flagsp2="",
                       makep1=TRUE,
                       makep2=FALSE
                       )
    plotCumulativeData(clCum,
                       keyMetricp2="deaths",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "deaths")
                       )
    plotCumulativeData(clCum,
                       keyMetricp2="cases",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "cases")
                       )
    plotCumulativeData(clCum,
                       keyMetricp2="tests",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "tests")
                       )
  } else {
    clCum <- NULL
  }
  # STEP 9: Return a list of the key data
  list(stateData=stateData,
       dfRaw=dfRaw,
       dfFiltered=dfFiltered,
       dfPerCapita=dfPerCapita,
       useClusters=useClusters,
       plotData=plotData,
       consolidatedPlotData=consolidatedPlotData,
       clCum=clCum
       )
}
The following functions are examined for update:
- getStateData - updated to default to using 2019 census data
- downloadCOVIDbyState - function is OK, copied as-is
- readCOViDbyState - updated for a cleaner log file
- processCVData - function is OK, copied as-is
- helperMakePerCapita - function is OK, copied as-is
- clusterStates - updated with a custom shape function for year-month
- assessClusters - function is OK
- plotConsolidatedMetrics - function is OK
- makeCumulative - function is OK
- plotCumulativeData - function is OK
So, the main effort will be to update the clusterStates() function. There is also an opportunity to clean up the other functions so that output is directed to a separate log file and warnings caused by not setting .groups can also be addressed.
State population data have been downloaded from US Census. The file is processed and saved so that it can be used in an updated getStateData() function:
# Read in the state population file
statePop2019 <- readr::read_csv("./RInputFiles/Coronavirus/SCPRC-EST2019-18+POP-RES.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## SUMLEV = col_character(),
## REGION = col_character(),
## DIVISION = col_character(),
## STATE = col_character(),
## NAME = col_character(),
## POPESTIMATE2019 = col_double(),
## POPEST18PLUS2019 = col_double(),
## PCNT_POPEST18PLUS = col_double()
## )
# Create mapping file for state name to state abbreviation
stateMap <- tibble::tibble(stateName=c(state.name, "District of Columbia"), stateAbb=c(state.abb, "DC"))
# Create fields for state (abbreviation) and pop_2019
statePop2019 <- statePop2019 %>%
full_join(stateMap, by=c("NAME"="stateName")) %>%
mutate(pop_2019=POPESTIMATE2019)
# Check if anything did not merge properly
# Expected that United States and Puerto Rico will not merge, others should be a good match
statePop2019 %>%
filter(is.na(stateAbb) | is.na(pop_2019))
## # A tibble: 2 x 10
## SUMLEV REGION DIVISION STATE NAME POPESTIMATE2019 POPEST18PLUS2019
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 010 0 0 00 Unit~ 328239523 255200373
## 2 040 X X 72 Puer~ 3193694 2620963
## # ... with 3 more variables: PCNT_POPEST18PLUS <dbl>, stateAbb <chr>,
## # pop_2019 <dbl>
# Delete the Puerto Rico and US totals data
statePop2019 <- statePop2019 %>%
filter(!is.na(stateAbb))
# Glimpse file and then save to RDS
glimpse(statePop2019)
## Rows: 51
## Columns: 10
## $ SUMLEV <chr> "040", "040", "040", "040", "040", "040", "040", ...
## $ REGION <chr> "3", "4", "4", "3", "4", "4", "1", "3", "3", "3",...
## $ DIVISION <chr> "6", "9", "8", "7", "9", "8", "1", "5", "5", "5",...
## $ STATE <chr> "01", "02", "04", "05", "06", "08", "09", "10", "...
## $ NAME <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cali...
## $ POPESTIMATE2019 <dbl> 4903185, 731545, 7278717, 3017804, 39512223, 5758...
## $ POPEST18PLUS2019 <dbl> 3814879, 551562, 5638481, 2317649, 30617582, 4499...
## $ PCNT_POPEST18PLUS <dbl> 77.8, 75.4, 77.5, 76.8, 77.5, 78.1, 79.6, 79.1, 8...
## $ stateAbb <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "...
## $ pop_2019 <dbl> 4903185, 731545, 7278717, 3017804, 39512223, 5758...
saveToRDS(statePop2019, ovrWrite=FALSE, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/statePop2019.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
The getStateData() function is updated to use defaults for this new file:
# Function to extract and format key state data
getStateData <- function(df=readFromRDS("statePop2019"),
                         renameVars=c("stateAbb"="state", "NAME"="name", "pop_2019"="pop"),
                         keepVars=c("state", "name", "pop")
                         ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing state data
  # renameVars: variables to be renamed, using a named vector with format "originalName"="newName"
  # keepVars: variables to be kept in the final file
  # Rename variables where appropriate; names with no entry in renameVars are left unchanged
  names(df) <- ifelse(is.na(renameVars[names(df)]), names(df), renameVars[names(df)])
  # Return file with only key variables kept
  # (select(all_of()) replaces the superseded select_at(vars(...)) idiom)
  df %>%
    select(all_of(keepVars))
}
The function is then tested, with totals by state compared against previous:
# Using the new defaults
pop_2019 <- getStateData()
pop_2019
## # A tibble: 51 x 3
## state name pop
## <chr> <chr> <dbl>
## 1 AL Alabama 4903185
## 2 AK Alaska 731545
## 3 AZ Arizona 7278717
## 4 AR Arkansas 3017804
## 5 CA California 39512223
## 6 CO Colorado 5758736
## 7 CT Connecticut 3565287
## 8 DE Delaware 973764
## 9 DC District of Columbia 705749
## 10 FL Florida 21477737
## # ... with 41 more rows
# Comparison to previous
fullPop <- pop_2019 %>%
full_join(usmap::statepop, by=c("state"="abbr")) %>%
mutate(pctChg=pop/pop_2015-1)
# Flag for any differences in name
fullPop %>%
filter(is.na(name) | is.na(full) | name != full)
## # A tibble: 0 x 7
## # ... with 7 variables: state <chr>, name <chr>, pop <dbl>, fips <chr>,
## # full <chr>, pop_2015 <dbl>, pctChg <dbl>
# Plot population and percent change
fullPop %>%
ggplot(aes(x=fct_reorder(state, pctChg), y=pctChg)) +
geom_col(fill="lightblue") +
coord_flip() +
labs(title="Change in population from usmap::statepop to US Census 2019 estimate",
x="",
y="Percent change from 2015 (usmap) to 2019 (US Census)"
)
The data appear to be reasonably aligned, with fast growing states and shrinking states roughly as expected.
The function downloadCOVIDbyState() appears to be OK as-is, and is copied below:
# NO CHANGES MADE TO FUNCTION
# Function to download data for COVID Tracking Project
downloadCOVIDbyState <- function(fileName,
                                 api="https://api.covidtracking.com/v1/states/daily.csv",
                                 ovrWrite=FALSE
                                 ) {
  # COVID Tracking Project API allows data downloads for personal, non-commercial use
  # https://covidtracking.com/data/api
  # FUNCTION ARGUMENTS:
  # fileName: the filename that the data will be saved to
  # api: The API link for data downloads
  # ovrWrite: whether to allow overwriting of the existing fileName
  # Guard against clobbering an existing file unless over-writing was requested
  if (file.exists(fileName)) {
    cat("\nFile already exists at:", fileName, "\n")
    if (!ovrWrite) {
      stop("Exiting due to ovrWrite=FALSE and a duplicate fileName\n")
    }
    cat("Will over-write with current data from", api, "\n")
  }
  # Download the file
  download.file(api, destfile=fileName)
  # Show statistics on downloaded file
  file.info(fileName)
}
The download function is then run using 2021 data:
# Example for downloading the 22-JAN-21 data file
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_210122.csv"
if (!file.exists(locDownload)) downloadCOVIDbyState(fileName=locDownload)
## size isdir mode
## ./RInputFiles/Coronavirus/CV_downloaded_210122.csv 5003425 FALSE 666
## mtime
## ./RInputFiles/Coronavirus/CV_downloaded_210122.csv 2021-01-22 09:08:23
## ctime
## ./RInputFiles/Coronavirus/CV_downloaded_210122.csv 2021-01-22 09:08:18
## atime exe
## ./RInputFiles/Coronavirus/CV_downloaded_210122.csv 2021-01-22 09:08:23 no
The function readCOViDbyState() is copied below, with new capability to direct control-total detail for changed items to a log file, for easier-to-read console output:
# Function to read, convert, and sanity check a downloaded file
# Returns the processed data frame; all comparison results are printed (or logged) as side effects
readCOViDbyState <- function(fileName,
                             checkFile=NULL,
                             controlFields=c("positiveIncrease", "deathIncrease", "hospitalizedCurrently"),
                             controlBy=c("state"),
                             dateChangePlot=FALSE,
                             dateMetricPrint=TRUE,
                             controlByMetricPrint=TRUE,
                             writeLog=NULL,
                             ovrwriteLog=TRUE
                             ) {
  # FUNCTION ARGUMENTS:
  # fileName: the file name for reading the data
  # checkFile: a file that can be used for comparison purposes (NULL means do not compare to anything)
  # controlFields: fields that will be explicitly checked against checkFile
  # controlBy: level of aggregation at which fields will be explicitly checked against checkFile
  # dateChangePlot: boolean, should the change in dates included be plotted rather than listed?
  # dateMetricPrint: boolean, should the list of date-metric changes be printed?
  # controlByMetricPrint: boolean, should the list of controlBy-metric changes be printed?
  # writeLog: write detailed comparison to log file (NULL means do not write)
  # ovrwriteLog: boolean, should the log be started from scratch with the date comparisons?
  # Helper function to check for similarity of key elements
  # Prints the two set differences; with countOnly=TRUE it prints only the counts,
  # optionally writes the detail to logFile, and returns the differences invisibly-by-value
  helperSimilarity <- function(newData, refData, label, countOnly=FALSE, logFile=NULL, logAppend=TRUE) {
    d1 <- setdiff(refData, newData)
    d2 <- setdiff(newData, refData)
    cat("\n\nChecking for similarity of:", label)
    cat("\nIn reference but not in current:", if(countOnly) length(d1) else d1)
    cat("\nIn current but not in reference:", if(countOnly) length(d2) else d2)
    if (countOnly & !is.null(logFile)) {
      cat("\nDetailed differences available in:", logFile)
      # capture.output() redirects the detailed listings to the log file instead of the console
      capture.output(cat("\nIn reference but not in current:\n", paste(d1, collapse="\n"), sep=""),
                     cat("\nIn current but not in reference:\n", paste(d2, collapse="\n"), sep=""),
                     file=logFile,
                     append=logAppend
                     )
    }
    # Differences are returned only in countOnly mode (used below for the date-change plot)
    if (countOnly) return(list(d1=d1, d2=d2))
  }
  # Read in the file and convert the numeric date field to date using ymd format
  df <- readr::read_csv(fileName) %>%
    mutate(date=lubridate::ymd(date))
  # Check that the file is unique by date-state
  if ((df %>% select(date, state) %>% anyDuplicated()) != 0) {
    stop("\nDuplicates by date and state, investigate and fix\n")
  } else {
    cat("\nFile is unique by state and date\n")
  }
  # Check for overall control totals in new file
  cat("\n\nOverall control totals in file:\n")
  df %>%
    summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
    print()
  # Get control totals by date for new file
  dfByDate <- df %>%
    group_by(date) %>%
    summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
    ungroup() %>%
    pivot_longer(-date, values_to="newValue")
  # If there is no checkFile, then just produce a plot of the key metrics
  if (is.null(checkFile)) {
    p1 <- dfByDate %>%
      ggplot(aes(x=date, y=newValue)) +
      geom_line() +
      facet_wrap(~name, nrow=1, scales="free_y") +
      labs(title="Control totals by date for new file (no reference file)", x="", y="Summed Value")
    print(p1)
  } else {
    # Check for similarity of fields, dates, and states
    # deparse(substitute()) echoes the caller's checkFile expression in the report header
    cat("\n*** COMPARISONS TO REFERENCE FILE:", deparse(substitute(checkFile)))
    helperSimilarity(newData=names(df), refData=names(checkFile), label="column names")
    helperSimilarity(newData=df %>% pull(state) %>% unique(),
                     refData=checkFile %>% pull(state) %>% unique(),
                     label="states"
                     )
    # Dates are formatted as sorted character strings so setdiff() compares them cleanly
    dateChangeList <- helperSimilarity(newData=df %>%
                                         pull(date) %>%
                                         unique() %>%
                                         format("%Y-%m-%d") %>%
                                         sort(),
                                       refData=checkFile %>%
                                         pull(date) %>%
                                         unique() %>%
                                         format("%Y-%m-%d") %>%
                                         sort(),
                                       label="dates",
                                       countOnly=dateChangePlot,
                                       logFile=writeLog,
                                       logAppend=!ovrwriteLog
                                       )
    # Plot date changes if requested
    if (dateChangePlot) {
      pDate <- tibble::tibble(date=as.Date(c(dateChangeList$d1, dateChangeList$d2)),
                              type=c(rep("Control File Only", length(dateChangeList$d1)),
                                     rep("New File Only", length(dateChangeList$d2))
                                     )
                              ) %>%
        ggplot(aes(x=date, fill=type)) +
        geom_bar() +
        coord_flip() +
        labs(x="", y="", title="Dates in one file and not in the other")
      print(pDate)
    }
    # Check for similarity of control totals by date in files
    checkByDate <- checkFile %>%
      group_by(date) %>%
      summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
      ungroup() %>%
      pivot_longer(-date, values_to="oldValue")
    # Flag only differences that are both 5+ in absolute terms AND at least 1% relative,
    # so trivial restatements are not reported
    deltaDate <- dfByDate %>%
      inner_join(checkByDate, by=c("date", "name")) %>%
      filter(abs(newValue-oldValue)>=5,
             pmax(newValue, oldValue)>=1.01*pmin(newValue, oldValue)
             ) %>%
      as.data.frame()
    cat("\n\nDifference of 5+ that is at least 1% (summed to date and metric):", nrow(deltaDate))
    if (dateMetricPrint) {
      cat("\n")
      print(deltaDate)
    }
    else if (!is.null(writeLog)) {
      # Detail suppressed on console; append it to the log instead
      cat("\nDetailed output available in log:", writeLog)
      capture.output(cat("\n\nChange by date:\n"), print(deltaDate), file=writeLog, append=TRUE)
    }
    p1 <- dfByDate %>%
      full_join(checkByDate, by=c("date", "name")) %>%
      pivot_longer(-c(date, name), names_to="newOld") %>%
      ggplot(aes(x=date, y=value, group=newOld, color=newOld)) +
      geom_line() +
      facet_wrap(~name, nrow=1, scales="free_y") +
      labs(title="Control totals by date for new and reference file", x="", y="Summed Value")
    print(p1)
    # Check for similarity of control totals by controlBy in files
    # semi_join() restricts the new file to the reference file's dates so totals are comparable
    dfByControl <- df %>%
      semi_join(select(checkFile, date), by="date") %>%
      group_by_at(vars(all_of(controlBy))) %>%
      summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
      ungroup() %>%
      pivot_longer(-all_of(controlBy), values_to="newValue")
    checkByControl <- checkFile %>%
      group_by_at(vars(all_of(controlBy))) %>%
      summarize_at(vars(all_of(controlFields)), .funs=sum, na.rm=TRUE) %>%
      ungroup() %>%
      pivot_longer(-all_of(controlBy), values_to="oldValue")
    # Same 5+/1% materiality screen as the by-date comparison above
    deltaBy <- dfByControl %>%
      inner_join(checkByControl, by=c(controlBy, "name")) %>%
      filter(abs(newValue-oldValue)>=5,
             pmax(newValue, oldValue)>=1.01*pmin(newValue, oldValue)
             ) %>%
      as.data.frame()
    cat("\n\nDifference of 5+ that is at least 1% (summed to",
        controlBy,
        "and metric):",
        nrow(deltaBy),
        "\n"
        )
    if (controlByMetricPrint) print(deltaBy)
  }
  # Return the processed data file
  df
}
The function is then applied to the downloaded 22-JAN-21 data, including a comparison to a previous 2020 file:
# Reading the file as a standalone
df1 <- readCOViDbyState(locDownload)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 24295426 400726 17340298
# Reading the file with a comparison to a previous file, previous approach
df2 <- readCOViDbyState(locDownload,
checkFile=readFromRDS("test_hier5_201001")$dfRaw
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 24295426 400726 17340298
##
## *** COMPARISONS TO REFERENCE FILE: readFromRDS("test_hier5_201001")$dfRaw
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: probableCases
##
## Checking for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-01-13 2020-01-14 2020-01-15 2020-01-16 2020-01-17 2020-01-18 2020-01-19 2020-01-20 2020-01-21 2020-10-01 2020-10-02 2020-10-03 2020-10-04 2020-10-05 2020-10-06 2020-10-07 2020-10-08 2020-10-09 2020-10-10 2020-10-11 2020-10-12 2020-10-13 2020-10-14 2020-10-15 2020-10-16 2020-10-17 2020-10-18 2020-10-19 2020-10-20 2020-10-21 2020-10-22 2020-10-23 2020-10-24 2020-10-25 2020-10-26 2020-10-27 2020-10-28 2020-10-29 2020-10-30 2020-10-31 2020-11-01 2020-11-02 2020-11-03 2020-11-04 2020-11-05 2020-11-06 2020-11-07 2020-11-08 2020-11-09 2020-11-10 2020-11-11 2020-11-12 2020-11-13 2020-11-14 2020-11-15 2020-11-16 2020-11-17 2020-11-18 2020-11-19 2020-11-20 2020-11-21 2020-11-22 2020-11-23 2020-11-24 2020-11-25 2020-11-26 2020-11-27 2020-11-28 2020-11-29 2020-11-30 2020-12-01 2020-12-02 2020-12-03 2020-12-04 2020-12-05 2020-12-06 2020-12-07 2020-12-08 2020-12-09 2020-12-10 2020-12-11 2020-12-12 2020-12-13 2020-12-14 2020-12-15 2020-12-16 2020-12-17 2020-12-18 2020-12-19 2020-12-20 2020-12-21 2020-12-22 2020-12-23 2020-12-24 2020-12-25 2020-12-26 2020-12-27 2020-12-28 2020-12-29 2020-12-30 2020-12-31 2021-01-01 2021-01-02 2021-01-03 2021-01-04 2021-01-05 2021-01-06 2021-01-07 2021-01-08 2021-01-09 2021-01-10 2021-01-11 2021-01-12 2021-01-13 2021-01-14 2021-01-15 2021-01-16 2021-01-17 2021-01-18 2021-01-19 2021-01-20 2021-01-21
##
## Difference of 5+ that is at least 1% (summed to date and metric): 163
## date name newValue oldValue
## 1 2020-02-29 positiveIncrease 3 18
## 2 2020-03-01 positiveIncrease 8 16
## 3 2020-03-02 positiveIncrease 30 44
## 4 2020-03-03 positiveIncrease 42 48
## 5 2020-03-05 positiveIncrease 63 103
## 6 2020-03-06 positiveIncrease 129 108
## 7 2020-03-07 positiveIncrease 135 175
## 8 2020-03-08 positiveIncrease 170 198
## 9 2020-03-09 positiveIncrease 276 292
## 10 2020-03-11 positiveIncrease 420 509
## 11 2020-03-12 positiveIncrease 683 671
## 12 2020-03-13 positiveIncrease 843 1072
## 13 2020-03-14 positiveIncrease 1025 924
## 14 2020-03-16 positiveIncrease 1706 1739
## 15 2020-03-17 positiveIncrease 2088 2588
## 16 2020-03-18 positiveIncrease 3357 3089
## 17 2020-03-21 positiveIncrease 6932 6793
## 18 2020-03-21 hospitalizedCurrently 1492 1436
## 19 2020-03-22 positiveIncrease 9223 9125
## 20 2020-03-23 positiveIncrease 11192 11439
## 21 2020-03-23 hospitalizedCurrently 2812 2770
## 22 2020-03-24 positiveIncrease 10882 10769
## 23 2020-03-25 positiveIncrease 12631 12908
## 24 2020-03-25 deathIncrease 241 235
## 25 2020-03-25 hospitalizedCurrently 5140 5062
## 26 2020-03-26 deathIncrease 313 320
## 27 2020-03-28 deathIncrease 551 538
## 28 2020-03-29 deathIncrease 505 521
## 29 2020-03-30 positiveIncrease 21202 22042
## 30 2020-03-31 deathIncrease 908 890
## 31 2020-04-01 positiveIncrease 26271 25791
## 32 2020-04-05 positiveIncrease 25910 25500
## 33 2020-04-06 positiveIncrease 28243 29002
## 34 2020-04-07 positiveIncrease 30476 30885
## 35 2020-04-09 positiveIncrease 35116 34503
## 36 2020-04-10 positiveIncrease 33608 34380
## 37 2020-04-10 deathIncrease 2083 2108
## 38 2020-04-11 positiveIncrease 31263 30501
## 39 2020-04-12 positiveIncrease 28115 27784
## 40 2020-04-13 positiveIncrease 24281 25195
## 41 2020-04-15 positiveIncrease 29921 30307
## 42 2020-04-16 positiveIncrease 31527 30978
## 43 2020-04-20 deathIncrease 1816 1798
## 44 2020-04-21 positiveIncrease 26039 26367
## 45 2020-04-23 deathIncrease 1809 1791
## 46 2020-04-24 deathIncrease 1974 1895
## 47 2020-04-25 deathIncrease 1629 1748
## 48 2020-04-27 positiveIncrease 22407 22708
## 49 2020-04-27 deathIncrease 1290 1270
## 50 2020-04-29 deathIncrease 2676 2713
## 51 2020-05-01 deathIncrease 1809 1778
## 52 2020-05-02 deathIncrease 1527 1564
## 53 2020-05-03 deathIncrease 1248 1231
## 54 2020-05-04 positiveIncrease 22195 22649
## 55 2020-05-05 deathIncrease 2490 2452
## 56 2020-05-06 deathIncrease 1918 1950
## 57 2020-05-07 positiveIncrease 27229 27537
## 58 2020-05-11 positiveIncrease 18140 18377
## 59 2020-05-12 positiveIncrease 22442 22890
## 60 2020-05-12 deathIncrease 1509 1487
## 61 2020-05-13 positiveIncrease 21500 21285
## 62 2020-05-13 deathIncrease 1730 1704
## 63 2020-05-14 deathIncrease 1853 1880
## 64 2020-05-15 positiveIncrease 25490 24685
## 65 2020-05-15 deathIncrease 1538 1507
## 66 2020-05-16 positiveIncrease 23743 24702
## 67 2020-05-16 deathIncrease 1237 988
## 68 2020-05-17 positiveIncrease 20436 20009
## 69 2020-05-17 deathIncrease 873 849
## 70 2020-05-18 positiveIncrease 20597 21028
## 71 2020-05-19 positiveIncrease 20687 20897
## 72 2020-05-21 deathIncrease 1380 1394
## 73 2020-05-22 positiveIncrease 24115 24433
## 74 2020-05-22 deathIncrease 1290 1341
## 75 2020-05-23 positiveIncrease 22561 21531
## 76 2020-05-23 deathIncrease 1040 1063
## 77 2020-05-24 positiveIncrease 19062 20072
## 78 2020-05-24 deathIncrease 688 680
## 79 2020-05-26 deathIncrease 665 645
## 80 2020-05-27 positiveIncrease 19172 19447
## 81 2020-05-27 deathIncrease 1335 1321
## 82 2020-06-01 positiveIncrease 20101 20482
## 83 2020-06-01 deathIncrease 679 668
## 84 2020-06-02 positiveIncrease 19879 20112
## 85 2020-06-03 positiveIncrease 20182 20390
## 86 2020-06-03 deathIncrease 975 993
## 87 2020-06-04 positiveIncrease 20477 20886
## 88 2020-06-04 deathIncrease 883 893
## 89 2020-06-05 positiveIncrease 23050 23394
## 90 2020-06-05 deathIncrease 835 826
## 91 2020-06-06 positiveIncrease 22746 23064
## 92 2020-06-06 deathIncrease 714 728
## 93 2020-06-07 positiveIncrease 19056 18740
## 94 2020-06-08 positiveIncrease 16923 17209
## 95 2020-06-08 deathIncrease 675 661
## 96 2020-06-09 positiveIncrease 16916 17312
## 97 2020-06-09 deathIncrease 886 902
## 98 2020-06-12 positiveIncrease 23141 23597
## 99 2020-06-12 deathIncrease 766 775
## 100 2020-06-14 positiveIncrease 21658 21399
## 101 2020-06-15 positiveIncrease 18255 18649
## 102 2020-06-16 positiveIncrease 22838 23478
## 103 2020-06-16 deathIncrease 720 730
## 104 2020-06-17 deathIncrease 780 767
## 105 2020-06-18 positiveIncrease 27042 27746
## 106 2020-06-18 deathIncrease 682 705
## 107 2020-06-19 positiveIncrease 30865 31471
## 108 2020-06-20 deathIncrease 615 629
## 109 2020-06-21 positiveIncrease 29188 27928
## 110 2020-06-22 positiveIncrease 26829 27281
## 111 2020-06-23 deathIncrease 724 710
## 112 2020-06-24 deathIncrease 704 724
## 113 2020-06-26 deathIncrease 618 637
## 114 2020-06-29 positiveIncrease 39398 39813
## 115 2020-06-30 positiveIncrease 47010 47864
## 116 2020-06-30 deathIncrease 579 596
## 117 2020-07-02 positiveIncrease 53511 54085
## 118 2020-07-04 deathIncrease 296 306
## 119 2020-07-06 positiveIncrease 40925 41959
## 120 2020-07-06 deathIncrease 237 243
## 121 2020-07-07 positiveIncrease 50990 51687
## 122 2020-07-07 deathIncrease 905 923
## 123 2020-07-08 deathIncrease 819 807
## 124 2020-07-10 deathIncrease 839 854
## 125 2020-07-13 positiveIncrease 57160 58133
## 126 2020-07-14 positiveIncrease 58609 62687
## 127 2020-07-15 positiveIncrease 69373 65797
## 128 2020-07-17 deathIncrease 939 951
## 129 2020-07-20 deathIncrease 376 363
## 130 2020-07-21 positiveIncrease 62920 63930
## 131 2020-07-22 deathIncrease 1149 1171
## 132 2020-07-23 deathIncrease 1070 1056
## 133 2020-07-27 positiveIncrease 54479 55332
## 134 2020-07-31 deathIncrease 1328 1312
## 135 2020-08-02 positiveIncrease 53301 46812
## 136 2020-08-03 positiveIncrease 42738 49713
## 137 2020-08-04 positiveIncrease 51198 51866
## 138 2020-08-04 deathIncrease 1241 1255
## 139 2020-08-10 positiveIncrease 41370 42089
## 140 2020-08-11 positiveIncrease 54935 55701
## 141 2020-08-14 positiveIncrease 57101 55636
## 142 2020-08-18 positiveIncrease 40070 40795
## 143 2020-08-18 deathIncrease 1179 1196
## 144 2020-08-25 positiveIncrease 36839 36379
## 145 2020-08-29 positiveIncrease 43995 44501
## 146 2020-08-30 positiveIncrease 38766 39501
## 147 2020-08-31 deathIncrease 380 366
## 148 2020-09-01 deathIncrease 1014 1027
## 149 2020-09-07 positiveIncrease 28117 28682
## 150 2020-09-08 deathIncrease 347 358
## 151 2020-09-15 positiveIncrease 34778 35445
## 152 2020-09-17 deathIncrease 880 863
## 153 2020-09-18 positiveIncrease 46889 47486
## 154 2020-09-20 positiveIncrease 35533 36295
## 155 2020-09-21 deathIncrease 281 287
## 156 2020-09-23 positiveIncrease 39498 38567
## 157 2020-09-24 deathIncrease 938 921
## 158 2020-09-26 positiveIncrease 47268 47836
## 159 2020-09-27 positiveIncrease 34990 35454
## 160 2020-09-28 positiveIncrease 35376 36524
## 161 2020-09-28 deathIncrease 246 257
## 162 2020-09-29 deathIncrease 724 739
## 163 2020-09-30 positiveIncrease 44909 44424
## Warning: Removed 122 row(s) containing missing values (geom_path).
##
##
## Difference of 5+ that is at least 1% (summed to state and metric): 9
## state name newValue oldValue
## 1 AK positiveIncrease 7889 8780
## 2 CO deathIncrease 2051 1952
## 3 FL positiveIncrease 698051 706514
## 4 HI positiveIncrease 12469 12289
## 5 ND deathIncrease 249 191
## 6 NJ positiveIncrease 210458 205275
## 7 PR positiveIncrease 23995 48750
## 8 WA positiveIncrease 89737 87042
## 9 WA hospitalizedCurrently 83519 63095
# Reading the file with a comparison to a previous file, key output directed to a log
df3 <- readCOViDbyState(locDownload,
checkFile=readFromRDS("test_hier5_201001")$dfRaw,
dateChangePlot=TRUE,
dateMetricPrint=FALSE,
writeLog="./RInputFiles/Coronavirus/testLogCTP_v001.log",
ovrwriteLog=TRUE
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 24295426 400726 17340298
##
## *** COMPARISONS TO REFERENCE FILE: readFromRDS("test_hier5_201001")$dfRaw
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference: probableCases
##
## Checking for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: dates
## In reference but not in current: 0
## In current but not in reference: 122
## Detailed differences available in: ./RInputFiles/Coronavirus/testLogCTP_v001.log
##
##
## Difference of 5+ that is at least 1% (summed to date and metric): 163
## Detailed output available in log: ./RInputFiles/Coronavirus/testLogCTP_v001.log
## Warning: Removed 122 row(s) containing missing values (geom_path).
##
##
## Difference of 5+ that is at least 1% (summed to state and metric): 9
## state name newValue oldValue
## 1 AK positiveIncrease 7889 8780
## 2 CO deathIncrease 2051 1952
## 3 FL positiveIncrease 698051 706514
## 4 HI positiveIncrease 12469 12289
## 5 ND deathIncrease 249 191
## 6 NJ positiveIncrease 210458 205275
## 7 PR positiveIncrease 23995 48750
## 8 WA positiveIncrease 89737 87042
## 9 WA hospitalizedCurrently 83519 63095
# Confirm that files are identical
identical(df1, df2)
## [1] TRUE
identical(df1, df3)
## [1] TRUE
The added functionality allows for a cleaner log, with detailed output optionally sent to a separate file. Next steps are to continue updating elements of the main function.
The processCVData() and helperMakePerCapita() functions are copied, along with the two associated helper functions (helperPerCapita() and helperRollingAgg()):
# Function to select relevant variables and observations, and report on control totals
processCVData <- function(dfFull,
                          varsKeep=c("date", "state", "positiveIncrease", "deathIncrease"),
                          varsRename=c("positiveIncrease"="cases", "deathIncrease"="deaths"),
                          stateList=c(state.abb, "DC")
                          ) {
  # FUNCTION ARGUMENTS
  # dfFull: the full data file originally loaded
  # varsKeep: variables to keep from the full file
  # varsRename: variables to be renamed, using a named vector of form originalName=newName
  # stateList: states considered valid (NULL means do not run any filters; all records kept)
  # Keep only the requested columns
  df <- dfFull %>%
    select(all_of(varsKeep))
  # Rename any column that appears in the renaming map; leave the others untouched
  mapped <- varsRename[names(df)]
  names(df) <- ifelse(is.na(mapped), names(df), mapped)
  # Flag each record as belonging to a valid state (all TRUE when no filter list is given)
  df <- df %>%
    mutate(validState=if (is.null(stateList)) TRUE else state %in% stateList)
  # Report control totals split by validity, so the discarded volume is visible in the log
  cat("\n\nControl totals - note that validState other than TRUE will be discarded\n(na.rm=TRUE)\n\n")
  df %>%
    mutate(n=1) %>%
    group_by(validState) %>%
    summarize_if(is.numeric, sum, na.rm=TRUE) %>%
    print()
  # Return only the valid-state records, dropping the helper flag
  df %>%
    filter(validState) %>%
    select(-validState)
}
# Function to add per capita and rolling metrics to the base data frame
# Handles an arbitrary number of variables via the mapVars mapping
helperMakePerCapita <- function(df,
                                mapVars=c("cases"="cpm", "deaths"="dpm"),
                                popData=stateData,
                                k=7
                                ) {
  # FUNCTION ARGUMENTS:
  # df: the initial data frame for conversion
  # mapVars: named vector of variables to be converted 'original name'='converted name'
  # popData: population reference data, passed through to helperPerCapita()
  # k: the rolling time period to use
  # Add one per capita column for each entry in mapVars
  df <- Reduce(
    function(acc, v) helperPerCapita(acc, origVar=v, newName=mapVars[[v]], popData=popData),
    names(mapVars),
    df
  )
  # Order within state by date so the rolling windows are chronological
  df <- df %>%
    group_by(state) %>%
    arrange(date)
  # Add one rolling column per per capita column (new name carries the window length as suffix)
  df <- Reduce(
    function(acc, v) helperRollingAgg(acc, origVar=v, newName=paste0(v, k), k=k),
    unname(mapVars),
    df
  )
  # Return the updated data frame, ungrouped
  df %>%
    ungroup()
}
# Helper function to create per capita metrics
helperPerCapita <- function(df,
                            origVar,
                            newName,
                            byVar="state",
                            popVar="pop",
                            popData=stateData,
                            mult=1000000
                            ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame currently being processed
  # origVar: the variable to be converted to per capita
  # newName: the new per capita variable name
  # byVar: the variable that will be merged by
  # popVar: the name of the population variable in the popData file
  # popData: the file containing the population data
  # mult: the multiplier, so that the metric is "per mult people"
  # Keep only the join key and the population column from the reference data
  popSlim <- popData %>%
    select(all_of(c(byVar, popVar)))
  # Join on population, scale the metric, then drop the population column
  # (inner join: records without a population match are dropped)
  df %>%
    inner_join(popSlim, by=byVar) %>%
    mutate(!!newName:=mult*.data[[origVar]]/.data[[popVar]]) %>%
    select(-all_of(popVar))
}
# Helper function to create rolling aggregates
helperRollingAgg <- function(df,
                             origVar,
                             newName,
                             func=zoo::rollmean,
                             k=7,
                             fill=NA,
                             ...
                             ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing the data
  # origVar: the original data column name
  # newName: the new variable column name
  # func: the function to be applied (zoo::rollmean will be by far the most common)
  # k: the periodicity (k=7 is rolling weekly data)
  # fill: how to fill leading/trailing positions to maintain the same vector lengths
  # ...: any other arguments to be passed to func
  # Apply the rolling function to the source column, storing the result under the new name
  df %>%
    mutate(!!newName:=func(.data[[origVar]], k=k, fill=fill, ...))
}
The processCVData() function is applied to df3:
# STEP 3: Process the data so that it includes all requested key variables
# (hospitalizedCurrently and totalTestResultsIncrease are added beyond the function defaults)
varsFilter <- c("date", "state", "positiveIncrease", "deathIncrease",
"hospitalizedCurrently", "totalTestResultsIncrease"
)
# Keep the filtered variables and rename them to short metric names used downstream
dfFilter3 <- processCVData(df3,
varsKeep=varsFilter,
varsRename=c(positiveIncrease="cases",
deathIncrease="deaths",
hospitalizedCurrently="hosp",
totalTestResultsIncrease="tests"
)
)
##
##
## Control totals - note that validState other than TRUE will be discarded
## (na.rm=TRUE)
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 98681 1886 103063 557060 1560
## 2 TRUE 24196745 398840 17237235 288880839 16690
glimpse(dfFilter3)
## Rows: 16,690
## Columns: 6
## $ date <date> 2021-01-21, 2021-01-21, 2021-01-21, 2021-01-21, 2021-01-21,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 209, 2881, 3106, 9398, 19673, 1983, 1662, 209, 748, 12602, 5...
## $ deaths <dbl> 1, 96, 55, 244, 571, 18, 48, 1, 1, 163, 111, 3, 51, -2, 138,...
## $ hosp <dbl> 58, 2478, 1160, 4580, 20408, 783, 1069, 256, 448, 7023, 5747...
## $ tests <dbl> 10845, 7587, 13041, 69803, 224393, 41760, 39098, 7161, 6275,...
The helperMakePerCapita() function is applied to dfFilter3:
# STEP 4: Convert to per capita
# Adds per-million columns (cpm/dpm/hpm/tpm) and their 7-day rolling versions (cpm7, etc.)
dfPer3 <- helperMakePerCapita(dfFilter3,
mapVars=c("cases"="cpm", "deaths"="dpm", "hosp"="hpm", "tests"="tpm"),
popData=pop_2019
)
glimpse(dfPer3)
## Rows: 16,690
## Columns: 14
## $ date <date> 2020-01-13, 2020-01-14, 2020-01-15, 2020-01-16, 2020-01-17,...
## $ state <chr> "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "MA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ cpm <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, 0.01876023, 0.01876023, 0.03752046, 0.03752046, ...
## $ dpm7 <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, NA, 0, NA, 0, 0, 0, 0, ...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, 0.00000000, 0.00000000, 0.00000000, 0.00000000, ...
The clusterStates() function is updated to allow for both month and year in calculating shape (since data will now be from 2020 and 2021). Addition of a custom shape function suffices:
# Updates to the clustering function
clusterStates <- function(df,
                          caseVar="cpm",
                          deathVar="dpm",
                          shapeFunc=lubridate::month,
                          minShape=NULL,
                          maxShape=NULL,
                          minDeath=0,
                          maxDeath=Inf,
                          minCase=0,
                          maxCase=Inf,
                          ratioTotalvsShape=1,
                          ratioDeathvsCase=1,
                          hierarchical=TRUE,
                          hierMethod="complete",
                          nCenters=3,
                          iter.max=10,
                          nstart=1,
                          testCenters=NULL,
                          returnList=FALSE,
                          hmlSegs=3,
                          eslSegs=2,
                          seed=NULL
                          ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing cases and deaths data
  # caseVar: the variable containing the cases per capita data
  # deathVar: the variable containing the deaths per capita data
  # shapeFunc: the function applied to date to create the time buckets defining curve shape
  #   (e.g., lubridate::month, or a custom function returning "YYYY-MM" labels)
  # minShape: the minimum value to be used for shape (to avoid very small amounts of data in Jan/Feb/Mar)
  #   when shape is the month, 4 means start with April data (NULL means keep everything)
  # maxShape: the maximum value to be used for shape (to avoid very small amounts of data in a partial month)
  #   when shape is the month, 9 means end with September data (NULL means keep everything)
  # minDeath: use this value as a floor for the death metric when calculating shape
  # maxDeath: use this value as a maximum when calculating distance using deaths
  # minCase: use this value as a floor for the case metric when calculating shape
  # maxCase: use this value as a maximum when calculating distance using cases
  # ratioTotalvsShape: amount of standard deviation to be kept in total variable vs shape variables
  # ratioDeathvsCase: amount of standard deviation to be kept in deaths vs cases
  #   (total death data will be scaled to have sd this many times higher than cases)
  #   (death percentages by time period will be scaled directly by this amount)
  # hierarchical: whether to create hierarchical clusters
  #   TRUE means run hierarchical clustering
  #   FALSE means run kmeans clustering
  #   NA means run rules-based clustering
  # hierMethod: the method for hierarchical clustering (e.g., 'complete' or 'single')
  # nCenters: the number of centers to use for kmeans clustering
  # iter.max: maximum number of kmeans iterations (default in kmeans algorithm is 10)
  # nstart: number of random sets chosen for kmeans (default in kmeans algorithm is 1)
  # testCenters: integer vector of centers to test (will create an elbow plot); NULL means do not test
  # returnList: boolean, if FALSE just the cluster object is returned
  #   if TRUE, a list is returned with dfCluster and the cluster object
  # hmlSegs: number of segments to create for volume of burden integrated over time
  # eslSegs: number of segments to create for shape of burden over time
  # seed: set the seed to this value (NULL means no seed)
  # Extract key information (aggregates and by shapeFunc for each state)
  df <- df %>%
    select_at(vars(all_of(c("date", "state", caseVar, deathVar)))) %>%
    purrr::set_names(c("date", "state", "cases", "deaths")) %>%
    mutate(timeBucket=shapeFunc(date)) %>%
    group_by(state, timeBucket) %>%
    summarize(cases=sum(cases), deaths=sum(deaths), .groups="drop") %>%
    ungroup()
  # Limit to only relevant time buckets if requested
  if (!is.null(minShape)) {
    df <- df %>%
      filter(timeBucket >= minShape)
  }
  if (!is.null(maxShape)) {
    df <- df %>%
      filter(timeBucket <= maxShape)
  }
  # Extract an aggregate by state, scaled so that they have the proper ratio
  dfAgg <- df %>%
    group_by(state) %>%
    summarize(totalCases=sum(cases), totalDeaths=sum(deaths), .groups="drop") %>%
    mutate(totalCases=pmin(totalCases, maxCase), totalDeaths=pmin(totalDeaths, maxDeath)) %>%
    ungroup() %>%
    mutate(totalDeaths=ratioDeathvsCase*totalDeaths*sd(totalCases)/sd(totalDeaths))
  # Extract the percentages (shapes) by time bucket, scaled so that they have the proper ratio
  dfShape <- df %>%
    pivot_longer(-c(state, timeBucket)) %>%
    group_by(state, name) %>%
    mutate(tot=pmax(sum(value), ifelse(name=="deaths", minDeath, minCase)),
           value=ifelse(name=="deaths", ratioDeathvsCase, 1) * value / tot) %>%
    select(-tot) %>%
    pivot_wider(state, names_from=c(name, timeBucket), values_from=value) %>%
    ungroup()
  # Function to calculate the summed SD over all non-state columns of a data frame
  calcSumSD <- function(df) {
    df %>%
      ungroup() %>%
      select(-state) %>%
      summarize_all(.funs=sd) %>%
      as.vector() %>%
      sum()
  }
  # Down-weight the aggregate data so that there is the proper sum of sd in aggregates and shapes
  aggSD <- calcSumSD(dfAgg)
  shapeSD <- calcSumSD(dfShape)
  dfAgg <- dfAgg %>%
    mutate_if(is.numeric, ~. * ratioTotalvsShape * shapeSD / aggSD)
  # Combine so there is one row per state
  dfCluster <- dfAgg %>%
    inner_join(dfShape, by="state")
  # Convert 'state' to rowname so the clustering routines see a numeric-only matrix
  keyData <- dfCluster %>%
    column_to_rownames("state")
  # Create rules-based segments (NA) or hierarchical segments (TRUE) or kmeans segments (FALSE)
  if (is.na(hierarchical)) {
    # Create pseudo-rules-based segments
    if (!is.null(seed)) set.seed(seed)
    # STEP 1: Classify high-medium-low based on deaths and cases
    hml <- kmeans(select(keyData, starts_with("total")),
                  centers=hmlSegs, iter.max=iter.max, nstart=nstart
                  )
    # STEP 2: Classify early-late based on shape
    esl <- kmeans(select(keyData, -starts_with("total")),
                  centers=eslSegs, iter.max=iter.max, nstart=nstart
                  )
    # STEP 3: Create a final segment (one distinct code per hml/esl combination)
    objCluster <- eslSegs*(hml$cluster-1) + esl$cluster
  } else if (isTRUE(hierarchical)) {
    # Create hierarchical segments
    objCluster <- hclust(dist(keyData), method=hierMethod)
    plot(objCluster)
  } else {
    # Create k-means segments
    # Create an elbow plot if testCenters is not NULL
    if (!is.null(testCenters)) {
      helperElbow(keyData, testCenters=testCenters, iter.max=iter.max, nstart=nstart, silhouette=TRUE)
    }
    # Create the kmeans cluster object, setting a seed if requested
    if (!is.null(seed)) set.seed(seed)
    objCluster <- kmeans(keyData, centers=nCenters, iter.max=iter.max, nstart=nstart)
    cat("\nCluster means and counts\n")
    # Print sizes and centers for review
    # (previously this value was pointlessly assigned to a throwaway 'n' using '=')
    objCluster$size %>% cbind(objCluster$centers) %>% round(2) %>% t() %>% print()
  }
  # Return the data and object in a list if returnList is TRUE, otherwise return only the clustering object
  if (!isTRUE(returnList)) {
    objCluster
  } else {
    list(objCluster=objCluster, dfCluster=dfCluster)
  }
}
# Function to create an elbow plot for various numbers of clusters in the data
helperElbow <- function(mtx,
                        testCenters,
                        iter.max,
                        nstart,
                        silhouette=FALSE
                        ) {
  # FUNCTION ARGUMENTS:
  # mtx: a numeric matrix, or an object that can be coerced to a numeric matrix (no character fields)
  # testCenters: integer vector for the centers to be tested
  # iter.max: parameter passed to kmeans
  # nstart: parameter passed to kmeans
  # silhouette: whether to calculate the silhouette score
  # Preallocate storage for tot.withinss and silhouetteScore
  totWithin <- vector("numeric", length(testCenters))
  silhouetteScore <- vector("numeric", length(testCenters))
  # Create the distancing data (required for silhouette score)
  if (silhouette) distData <- dist(mtx)
  # Run k-means for every value in testCenters, and store $tot.withinss (and silhouetteScore, if requested)
  for (n in seq_along(testCenters)) {
    k <- testCenters[n]
    km <- kmeans(mtx, centers=k, iter.max=iter.max, nstart=nstart)
    totWithin[n] <- km$tot.withinss
    # Silhouette is undefined for a single cluster; leave the preallocated 0 in that case
    # (scalar condition, so use short-circuit && rather than elementwise &)
    if (silhouette && (k > 1)) silhouetteScore[n] <- mean(cluster::silhouette(km$cluster, distData)[, 3])
  }
  # Create the elbow plot
  p1 <- tibble::tibble(n=testCenters, wss=totWithin) %>%
    ggplot(aes(x=n, y=wss)) +
    geom_point() +
    geom_line() +
    geom_text(aes(y=wss + 0.05*max(totWithin), x=n+0.2, label=round(wss, 1))) +
    labs(x="Number of segments", y="Total Within Sum-Squares", title="Elbow plot") +
    ylim(c(0, NA))
  # Create the silhouette plot if requested
  if (silhouette) {
    p2 <- tibble::tibble(n=testCenters, ss=silhouetteScore) %>%
      ggplot(aes(x=n, y=ss)) +
      geom_point() +
      geom_line() +
      geom_text(aes(y=ss + 0.05*max(silhouetteScore), x=n+0.2, label=round(ss, 1))) +
      labs(x="Number of segments", y="Mean silhouette width", title="Silhouette plot") +
      ylim(c(-1, NA))
    gridExtra::grid.arrange(p1, p2, nrow=1)
  } else {
    print(p1)
  }
}
The shapeFunc capability works as intended when a custom bucketing function is supplied:
# Time-bucket labeler for shapeFunc: maps a Date to a "YYYY-MM" string
customTimeBucket <- function(x) {
  # format() on a Date yields the same zero-padded year-month label as pasting
  # lubridate::year with a zero-padded lubridate::month
  format(x, "%Y-%m")
}
# Example for rules-based clustering
clRules3 <- clusterStates(df=dfPer3,
hierarchical=NA,
returnList=TRUE,
shapeFunc=customTimeBucket,
minShape="2020-04",
maxShape="2021-01",
hmlSegs=3,
eslSegs=3,
seed=2101231503
)
# Example for kmeans clustering (elbow plot)
clKMeans3 <- clusterStates(df=dfPer3,
hierarchical=FALSE,
returnList=TRUE,
shapeFunc=customTimeBucket,
minShape="2020-04",
maxShape="2021-01",
iter.max=50,
nstart=25,
testCenters=1:20,
seed=2101231503
)
##
## Cluster means and counts
## 1 2 3
## . 16.00 28.00 7.00
## totalCases 1.90 1.73 0.72
## totalDeaths 1.80 1.06 0.47
## cases_2020-04 0.06 0.02 0.03
## deaths_2020-04 0.15 0.07 0.10
## cases_2020-05 0.04 0.03 0.03
## deaths_2020-05 0.13 0.07 0.08
## cases_2020-06 0.03 0.03 0.03
## deaths_2020-06 0.06 0.04 0.05
## cases_2020-07 0.05 0.07 0.04
## deaths_2020-07 0.05 0.06 0.04
## cases_2020-08 0.04 0.06 0.07
## deaths_2020-08 0.05 0.07 0.06
## cases_2020-09 0.05 0.06 0.05
## deaths_2020-09 0.04 0.06 0.06
## cases_2020-10 0.10 0.10 0.07
## deaths_2020-10 0.06 0.08 0.07
## cases_2020-11 0.22 0.22 0.17
## deaths_2020-11 0.12 0.13 0.08
## cases_2020-12 0.25 0.25 0.28
## deaths_2020-12 0.21 0.24 0.25
## cases_2021-01 0.17 0.16 0.23
## deaths_2021-01 0.14 0.17 0.20
# Example for kmeans clustering (clusters)
clKMeans3 <- clusterStates(df=dfPer3,
hierarchical=FALSE,
returnList=TRUE,
shapeFunc=customTimeBucket,
minShape="2020-04",
maxShape="2021-01",
nCenters=6,
iter.max=50,
nstart=25,
seed=2101231503
)
##
## Cluster means and counts
## 1 2 3 4 5 6
## . 7.00 14.00 10.00 3.00 6.00 11.00
## totalCases 1.57 1.45 1.93 2.57 0.64 1.95
## totalDeaths 1.94 1.08 0.86 1.97 0.43 1.44
## cases_2020-04 0.10 0.03 0.01 0.04 0.03 0.02
## deaths_2020-04 0.27 0.11 0.04 0.05 0.11 0.05
## cases_2020-05 0.05 0.04 0.02 0.03 0.02 0.03
## deaths_2020-05 0.18 0.11 0.04 0.09 0.07 0.06
## cases_2020-06 0.03 0.03 0.02 0.01 0.03 0.04
## deaths_2020-06 0.06 0.05 0.03 0.05 0.04 0.04
## cases_2020-07 0.05 0.08 0.05 0.02 0.04 0.08
## deaths_2020-07 0.05 0.06 0.04 0.02 0.04 0.07
## cases_2020-08 0.04 0.06 0.05 0.04 0.07 0.06
## deaths_2020-08 0.05 0.08 0.05 0.02 0.05 0.08
## cases_2020-09 0.03 0.05 0.06 0.07 0.05 0.06
## deaths_2020-09 0.03 0.07 0.05 0.05 0.06 0.06
## cases_2020-10 0.06 0.07 0.14 0.18 0.07 0.09
## deaths_2020-10 0.03 0.07 0.10 0.12 0.07 0.07
## cases_2020-11 0.16 0.18 0.29 0.31 0.18 0.21
## deaths_2020-11 0.06 0.10 0.19 0.23 0.08 0.12
## cases_2020-12 0.28 0.26 0.23 0.20 0.28 0.25
## deaths_2020-12 0.15 0.20 0.29 0.26 0.27 0.25
## cases_2021-01 0.20 0.19 0.13 0.10 0.23 0.17
## deaths_2021-01 0.13 0.16 0.16 0.11 0.21 0.19
# Example for hierarchical clustering (clusters)
clHier3 <- clusterStates(df=dfPer3,
hierarchical=TRUE,
returnList=FALSE,
shapeFunc=customTimeBucket,
minShape="2020-04",
maxShape="2021-01"
)
The assessClusters() function is then copied with a small number of minor edits made:
assessClusters <- function(clusters,
                           dfState=stateData,
                           dfBurden=cvFilteredPerCapita,
                           thruLabel="Aug 20, 2020",
                           isCounty=FALSE,
                           plotsTogether=FALSE,
                           clusterPlotsTogether=plotsTogether,
                           recentTotalTogether=plotsTogether,
                           clusterAggTogether=plotsTogether,
                           makeSummaryPlots=TRUE,
                           makeTotalvsPerCapitaPlots=!isCounty,
                           makeRecentvsTotalPlots=TRUE,
                           makeTotalvsElementPlots=TRUE,
                           showMap=!isCounty,
                           orderCluster=FALSE
                           ) {
  # FUNCTION ARGUMENTS:
  # clusters: the named vector containing the clusters by state
  # dfState: the file containing the states and populations
  # dfBurden: the data containing the relevant per capita burden statistics by state-date
  # thruLabel: label for plots for 'data through'
  # isCounty: boolean, is this a plot of county-level data that have been named 'state'?
  #   FALSE means that it is normal state-level data
  # plotsTogether: boolean, should plots be consolidated on fewer pages?
  # clusterPlotsTogether: boolean, should plots p1-p4 be consolidated?
  # recentTotalTogether: boolean, should recent total plots p7-p8 be consolidated?
  # clusterAggTogether: boolean, should aggregate plots p9/p11 and p10/p12 be consolidated?
  # makeSummaryPlots: boolean, should the summary plots be made?
  # makeTotalvsPerCapitaPlots: boolean, should the total and per capita plots be produced?
  # makeRecentvsTotalPlots: boolean, should the recent vs. total plots be created?
  # makeTotalvsElementPlots: boolean, should the total vs. element plots be created?
  # showMap: boolean, whether to create a map colored by cluster (will show segment counts otherwise)
  # orderCluster: if FALSE, ignore; if TRUE, order by "dpm"; if anything else, order by orderCluster
  # Attach the clusters to the state population data
  dfState <- as.data.frame(clusters) %>%
    set_names("cluster") %>%
    rownames_to_column("state") %>%
    inner_join(dfState, by="state") %>%
    mutate(cluster=factor(cluster))
  # Join the daily burden data to the cluster assignments for plotting
  dfPlot <- dfState %>%
    inner_join(dfBurden, by="state") %>%
    tibble::as_tibble()
  # Reorder the clusters if requested
  if (!isFALSE(orderCluster)) {
    if (isTRUE(orderCluster)) burdenParam <- "dpm" else burdenParam <- orderCluster
    dfPlot <- changeOrderLabel(dfPlot, grpVars="state", burdenVar=burdenParam)
  }
  # Call the helper function to make the overall summary statistics
  if (makeSummaryPlots) {
    helperClusterSummaryPlots(dfState=dfState,
                              dfPlot=dfPlot,
                              showMap=showMap,
                              clusterPlotsTogether=clusterPlotsTogether,
                              mapLevel=if(isCounty) "counties" else "states"
                              )
  }
  # These are primarily useful for state-level data and default to FALSE when isCounty is TRUE
  if (makeTotalvsPerCapitaPlots) {
    helperCallTotalPerCapita(dfPlot=dfPlot, thruLabel=thruLabel)
  }
  # Call the helper function to make recent vs. total plots
  if (makeRecentvsTotalPlots) {
    helperCallRecentvsTotal(dfPlot=dfPlot,
                            thruLabel=thruLabel,
                            labelPoints=!isCounty,
                            recentTotalTogether = recentTotalTogether
                            )
  }
  # Call the total vs. elements helper function
  if (makeTotalvsElementPlots) {
    # BUG FIX: previously passed clusterPlotsTogether here, so the clusterAggTogether
    # argument was declared and documented but never honored
    helperCallTotalvsElements(dfPlot=dfPlot,
                              aggAndTotal=!isCounty,
                              clusterAggTogether=clusterAggTogether
                              )
  }
  # Return the plotting data frame
  dfPlot
}
# Function to reorder and relabel factors
changeOrderLabel <- function(df,
                             fctVar="cluster",
                             grpVars=c(),
                             burdenVar="dpm",
                             wgtVar="pop",
                             exclfct="999"
                             ) {
  # FUNCTION ARGUMENTS
  # df: the data frame
  # fctVar: the factor variable
  # grpVars: the level the data are currently at (e.g., "fipsCounty" for county-level in df)
  # burdenVar: the disease burden variable for sorting
  # wgtVar: the weight variable for sorting
  # exclfct: the factor level to be excluded from analysis
  # General approach
  # 1. Data are aggregated to c(fctVar, grpVars) as x=sum(burdenVar*wgtVar) and y=mean(wgtVar)
  # 2. Data are aggregated to fctVar as z=sum(x)/sum(y) (a weighted mean burden per level)
  # 3. Factors are reordered from high to low on z, with the excluded factor added back last (if it exists)
  # Determine whether the excluded level actually occurs in the factor
  fctDummy <- exclfct %in% levels(df[, fctVar, drop=TRUE])
  # Rank the non-excluded factor levels by weighted burden, highest first
  newLevels <- df %>%
    filter(.data[[fctVar]] != exclfct) %>%
    group_by_at(vars(all_of(c(fctVar, grpVars)))) %>%
    summarize(x=sum(.data[[burdenVar]]*.data[[wgtVar]]), y=mean(.data[[wgtVar]]), .groups="drop") %>%
    group_by_at(vars(all_of(fctVar))) %>%
    summarize(z=sum(x)/sum(y), .groups="drop") %>%
    arrange(desc(z)) %>%
    pull(fctVar) %>%
    as.character()
  # Append the excluded level at the end so it sorts last (if it exists)
  if (fctDummy) newLevels <- c(newLevels, exclfct)
  # Reassign the levels in df
  df %>%
    mutate(!!fctVar:=factor(.data[[fctVar]], levels=newLevels, labels=newLevels))
}
# Helper function to make the overall cluster summary statistics
# Produces four plots: membership (map or counts), population, rolling burden, total burden
helperClusterSummaryPlots <- function(dfState,
dfPlot,
showMap,
clusterPlotsTogether,
weightedMean=TRUE,
mapLevel="states"
) {
# FUNCTION ARGUMENTS:
# dfState: contains the state/county-level data
# dfPlot: contains the joined data for plotting
# showMap: boolean for whether to create a map (if FALSE, segment membership counts are shown instead)
# clusterPlotsTogether: boolean, whether to put all four plots on the same page
# weightedMean: boolean, whether to create weighted mean by segment (if FALSE, median by segment is taken)
# mapLevel: the level to be used on the map
# NOTE(review): p4's facet labels rely on a global 'varMapper' named vector — confirm it is
# defined in the calling environment before this function runs
# Reorder the cluster levels in dfState to match dfPlot
# Convert factor order to match dfPlot (can be reordered by argument to the calling function)
dfState <- dfState %>%
mutate(cluster=factor(cluster, levels=levels(dfPlot$cluster)))
# Plot the segments on a map or show segment membership
if (showMap) {
if (mapLevel=="counties") {
# County data store the FIPS code in 'state'; left-pad to the 5 digits usmap expects
dfMap <- dfState %>%
mutate(fips=stringr::str_pad(state, width=5, side="left", pad="0")) %>%
select(fips, cluster)
} else {
dfMap <- dfState
}
# Create the map
p1 <- usmap::plot_usmap(regions=mapLevel, data=dfMap, values="cluster") +
scale_fill_discrete("cluster") +
theme(legend.position="right")
} else {
# fct_rev so the first cluster level appears at the top after coord_flip()
p1 <- dfState %>%
count(cluster) %>%
ggplot(aes(x=fct_rev(cluster), y=n)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=n/2, label=n)) +
coord_flip() +
labs(x="", y="# Counties", title="Membership by segment")
}
# Plot the population totals by segment
# Show population totals by cluster
p2 <- dfState %>%
group_by(cluster) %>%
summarize(pop=sum(pop)/1000000, .groups="drop") %>%
ggplot(aes(x=fct_rev(cluster), y=pop)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=pop/2, label=round(pop))) +
labs(y="Population (millions)", x="Cluster", title="Population by cluster (millions)") +
coord_flip()
# Plot the rolling 7-day mean daily disease burden by cluster
# Create the p3Data to be either median of all elements in cluster or weighted mean
# (both aggregates are computed; weightedMean selects which one is drawn)
p3 <- dfPlot %>%
select(date, cluster, cases=cpm7, deaths=dpm7, pop) %>%
pivot_longer((-c(date, cluster, pop))) %>%
filter(!is.na(value)) %>%
group_by(date, cluster, name) %>%
summarize(mdnValue=median(value), wtdValue=sum(pop*value)/sum(pop), .groups="drop") %>%
ggplot(aes(x=date, y=if(weightedMean) wtdValue else mdnValue)) +
geom_line(aes(group=cluster, color=cluster)) +
facet_wrap(~name, scales="free_y") +
labs(x="",
y="Rolling 7-day mean, per million",
title="Rolling 7-day mean daily disease burden, per million",
subtitle=paste0(if(weightedMean) "Weighted mean" else "Median",
" per day for states assigned to cluster"
)
) +
scale_x_date(date_breaks="1 months", date_labels="%b")
# Plot the total cases and total deaths by cluster
p4 <- dfPlot %>%
group_by(cluster) %>%
summarize(cases=sum(cases), deaths=sum(deaths), .groups="drop") %>%
pivot_longer(-cluster) %>%
ggplot(aes(x=fct_rev(cluster), y=value/1000)) +
geom_col(aes(fill=cluster)) +
geom_text(aes(y=value/2000, label=round(value/1000))) +
coord_flip() +
facet_wrap(~varMapper[name], scales="free_x") +
labs(x="Cluster", y="Burden (000s)", title="Total cases and deaths by segment")
# Place the plots together if plotsTogether is TRUE, otherwise just print
if (isTRUE(clusterPlotsTogether)) {
gridExtra::grid.arrange(p1, p2, p3, p4, nrow=2, ncol=2)
} else {
print(p1); print(p2); print(p3); print(p4)
}
}
# Helper function to make total and per capita by state (calls its own helper function)
helperCallTotalPerCapita <- function(dfPlot,
                                     thruLabel
                                     ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the plotting data frame
  # thruLabel: the date that data are through
  # Draw the same chart twice: once on raw totals (cases/deaths),
  # once on the per-million metrics (cpm/dpm); both colored by cluster
  plotTitle <- paste0("Coronavirus impact by state through ", thruLabel)
  for (metricPair in list(c("cases", "deaths"), c("cpm", "dpm"))) {
    helperBarDeathsCases(dfPlot,
                         numVars=metricPair,
                         title=plotTitle,
                         xVar=c("state"),
                         fillVar=c("cluster")
                         )
  }
}
# Helper function to make recent vs. total plots
helperCallRecentvsTotal <- function(dfPlot,
                                    thruLabel,
                                    labelPoints,
                                    recentTotalTogether
                                    ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the plotting data frame
  # thruLabel: the date that data are through
  # labelPoints: boolean, whether to label the individual points
  # recentTotalTogether: boolean, whether to put these plots together on one page
  # Build one last-30-vs-total plot per metric: cases per million, then deaths per million
  plotTitle <- paste0("Coronavirus burden through ", thruLabel)
  plotPair <- lapply(c("cpm", "dpm"), function(metric) {
    helperRecentvsTotal(dfPlot,
                        xVar=metric,
                        yVar=paste0("new", metric),
                        title=plotTitle,
                        labelPlot=labelPoints,
                        printPlot=FALSE
                        )
  })
  # Print the plots either as a single page or separately
  if (isTRUE(recentTotalTogether)) {
    gridExtra::grid.arrange(plotPair[[1]], plotPair[[2]], nrow=1)
  } else {
    print(plotPair[[1]]); print(plotPair[[2]])
  }
}
# Helper function to create total vs. elements plots
helperCallTotalvsElements <- function(dfPlot,
                                      aggAndTotal,
                                      clusterAggTogether,
                                      ...
                                      ) {
  # FUNCTION ARGUMENTS:
  # dfPlot: the plotting data frame
  # aggAndTotal: boolean, should each individual line be plotted (if FALSE an 80% CI is plotted instead)
  # clusterAggTogether: boolean, whether to print the plots all on a single page
  # ...: any other arguments to pass to helperTotalvsElements (especially pctRibbon or aggFunc)
  # Build four plots: for each metric, one free y-scale and one fixed y-scale version
  titles <- c(cpm7="Cases per million, 7-day rolling mean",
              dpm7="Deaths per million, 7-day rolling mean"
              )
  plots <- list()
  for (metric in names(titles)) {
    for (scaleType in c("free_y", "fixed")) {
      plots[[paste(metric, scaleType, sep="_")]] <-
        helperTotalvsElements(dfPlot,
                              keyVar=metric,
                              aggAndTotal=aggAndTotal,
                              title=titles[[metric]],
                              facetScales=scaleType,
                              printPlot=FALSE,
                              ...
                              )
    }
  }
  # On one page: pair the free-scale plots, then the fixed-scale plots; otherwise print all four
  if (isTRUE(clusterAggTogether)) {
    gridExtra::grid.arrange(plots[["cpm7_free_y"]], plots[["dpm7_free_y"]], nrow=1)
    gridExtra::grid.arrange(plots[["cpm7_fixed"]], plots[["dpm7_fixed"]], nrow=1)
  } else {
    print(plots[["cpm7_free_y"]]); print(plots[["cpm7_fixed"]])
    print(plots[["dpm7_free_y"]]); print(plots[["dpm7_fixed"]])
  }
}
# Function to create side-by-side plots for a deaths and cases metric
# Data in df will be aggregated to be unique by byVar using aggFunc
helperBarDeathsCases <- function(df,
                                 numVars,
                                 title="",
                                 xVar="state",
                                 fillVar=NULL,
                                 aggFunc=sum,
                                 mapper=varMapper
                                 ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing the data
  # numVars: the relevant numeric variables for plotting
  # title: plot title, default is nothing
  # xVar: the x-axis variable for plotting
  # fillVar: the variable that will color the bars in the final plot (NULL means use "lightblue" for all)
  # aggFunc: the aggregate function (will be applied to create data unique by byVar)
  # mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
  # OVERALL FUNCTION PROCESS:
  # 1. Variables in numVars are aggregated by aggFunc to be unique by c(xVar, fillVar)
  # 2. Data are pivoted longer
  # 3. Bar charts are created, with coloring by fillVar if provided
  # Aggregation keys: the x variable, plus the fill variable when one is given
  byVar <- if (is.null(fillVar)) xVar else c(xVar, fillVar)
  # Choose the bar layer up front: colored by fillVar, or a single light blue
  barGeom <- if (is.null(fillVar)) geom_col(fill="lightblue") else geom_col(aes_string(fill=fillVar))
  # Aggregate, pivot long, then draw horizontal bars faceted by metric
  p1 <- df %>%
    select_at(vars(all_of(c(byVar, numVars)))) %>%
    group_by_at(vars(all_of(byVar))) %>%
    summarize_all(aggFunc) %>%
    pivot_longer(-all_of(byVar)) %>%
    ggplot(aes(x=fct_reorder(get(xVar), value, .fun=min), y=value)) +
    coord_flip() +
    facet_wrap(~mapper[name], scales="free_x") +
    labs(x="", y="", title=title) +
    barGeom
  # Print the plot
  print(p1)
}
# Helper function to assess 30-day change vs. total
# Scatter (or labeled text) plot of recent burden vs. cumulative burden by state, colored by cluster
helperRecentvsTotal <- function(df,
xVar,
yVar,
title,
recencyDays=30,
ablineSlope=0.5,
mapper=varMapper,
labelPlot=TRUE,
printPlot=TRUE
) {
# FUNCTION ARGUMENTS:
# df: the tibble containing data by state by day
# xVar: the x-variable
# yVar: the y-variable
# title: the plot title
# recencyDays: number of days to consider as recent
# ablineSlope: dashed line will be drawn with this slope and intercept 0
# mapper: mapping file to convert x/y variables to descriptive axes (named vector "variable"="label")
# labelPlot: boolean, whether to show the labels for each point
# printPlot: boolean, whether to display the plot (if FALSE, plot object is returned)
# NOTE(review): mapper defaults to a global 'varMapper'; df must contain columns
# date, state, cluster, cases, deaths, cpm, dpm in addition to xVar/yVar
# Determine the cutoff date that defines 'recent' (the last recencyDays days of data)
dateCutoff <- df %>% pull(date) %>% max() - recencyDays + 1
cat("\nRecency is defined as", format(dateCutoff, "%Y-%m-%d"), "through current\n")
# Create the plot
# Zero out pre-cutoff values so one pass of sums yields both totals and recent-only totals
# NOTE(review): sums use the default na.rm=FALSE, so any NA metric makes that state's total NA — confirm intended
p1 <- df %>%
mutate(newCases=ifelse(date >= dateCutoff, cases, 0),
newDeaths=ifelse(date >= dateCutoff, deaths, 0),
newcpm=ifelse(date >= dateCutoff, cpm, 0),
newdpm=ifelse(date >= dateCutoff, dpm, 0)
) %>%
group_by(state, cluster) %>%
summarize_if(is.numeric, .funs=sum) %>%
ungroup() %>%
ggplot(aes_string(x=xVar, y=yVar)) +
labs(x=mapper[xVar],
y=mapper[yVar],
title=title,
subtitle=paste0("Dashed line represents ",
round(100*ablineSlope),
"% of total is new in last ",
recencyDays,
" days"
)
) +
geom_abline(lty=2, slope=ablineSlope) +
lims(x=c(0, NA), y=c(0, NA)) +
theme(legend.position = "bottom")
# Add the appropriate geom (scatterplot if labelPlot is FALSE)
if (labelPlot) p1 <- p1 + geom_text(aes(color=cluster, label=state))
else p1 <- p1 + geom_point(aes(color=cluster), alpha=0.5)
# Either display the plot or hand the object back to the caller
if (isTRUE(printPlot)) {
print(p1)
} else {
p1
}
}
# Function to plot cluster vs. individual elements on a key metric
helperTotalvsElements <- function(df,
keyVar,
title,
aggAndTotal=TRUE,
pctRibbon=0.8,
aggFunc=if(aggAndTotal) median else mean,
mapper=varMapper,
facetScales="free_y",
printPlot=TRUE
) {
# FUNCTION ARGUMENTS:
# df: the data frame containing n-day rolling averages
# keyVar: the variable to be plotted (character name of a column in df)
# title: the plot title
# aggAndTotal: boolean, whether to plot every individual observation along with the cluster aggregate
# pctRibbon: if aggAndTotal is FALSE, a ribbon covering this percentage of the data will be plotted
# aggFunc: how to aggregate the elements to the segment
# CAUTION that this is an aggregate of averages, rather than a population-weighted aggregate
# mapper: the variable mapping file to get the appropriate label for keyVar
# facetScales: the scaling for the facets - "free_y" to let them all float, "fixed" to have them the same
# printPlot: boolean, if TRUE print the plot (otherwise return the plot object)
# RETURNS: nothing useful when printPlot is TRUE; otherwise the ggplot object
# Create an appropriate subtitle (final else is a tripwire for unhandled facetScales values)
subtitle <- if(facetScales=="free_y") {
"Caution that each facet has its own y axis with different scales"
} else if (facetScales=="fixed") {
"All facets are on the same scale"
} else {
"Update subtitle code in function helperTotalvsElements"
}
# Create an appropriate caption (xCap is the two-line ribbon caption used when aggAndTotal is FALSE)
xCap <- "1. Cluster aggregate is mean over all observations (NOT population-weighted)"
xCap <- paste0(xCap, "\n2. Ribbons reflect range covering ", round(pctRibbon*100), "% of observations")
caption <- if(aggAndTotal) {
"Cluster aggregate is median, weighting each observation equally\n(NOT population-weighted)"
} else {
xCap
}
# Create the plots for segment-level data
# rbind(mutate(., state="cluster")) duplicates every row with state relabeled
# (magrittr's . is the piped data), so the single grouped summarize produces both
# the per-state series and a "cluster" aggregate series in one pass
# data=~filter(...) is rlang lambda shorthand: the geom filters the plot's own data
p1 <- df %>%
rbind(mutate(., state="cluster")) %>%
group_by(state, cluster, date) %>%
summarize_at(vars(all_of(keyVar)), .funs=aggFunc) %>%
ungroup() %>%
filter(!is.na(get(keyVar))) %>%
ggplot(aes_string(x="date")) +
geom_line(data=~filter(., state == "cluster"),
aes(y=get(keyVar), group=state, color=cluster),
lwd=1.5
) +
facet_wrap(~cluster, scales=facetScales) +
labs(x="",
y=mapper[keyVar],
title=title,
subtitle=subtitle,
caption=caption
) +
ylim(c(0, NA)) +
theme(legend.position="bottom")
# Add an appropriate individual metric (either every observation or quantiles)
if (aggAndTotal) {
# Overlay one faint line per state on top of the bold cluster aggregate
p1 <- p1 +
geom_line(data=~filter(., state != "cluster"),
aes(y=get(keyVar), group=state),
alpha=0.25
)
} else {
# Ribbon bounds: symmetric quantiles around the median covering pctRibbon of states
dfRibbon <- df %>%
filter(!is.na(get(keyVar))) %>%
group_by(cluster, date) %>%
summarize(ylow=quantile(get(keyVar), 0.5-0.5*pctRibbon),
yhigh=quantile(get(keyVar), 0.5+0.5*pctRibbon),
.groups="drop"
)
# x (date) is inherited from the plot aes; dfRibbon supplies ymin/ymax
p1 <- p1 +
geom_ribbon(data=dfRibbon,
aes(ymin=ylow, ymax=yhigh),
alpha=0.25
)
}
# Print plot if requested, otherwise return it
if (isTRUE(printPlot)) {
print(p1)
} else {
p1
}
}
The function is then run using the latest hierarchical clusters:
# Example for running assessClusters
# cutree() cuts the hierarchical clustering object clHier3 into k=6 clusters;
# the result is assessed against 2019 population (pop_2019) and per-capita
# burden data (dfPer3), with all assessment plots combined (plotsTogether=TRUE)
plotDataHier3 <- assessClusters(cutree(clHier3, k=6),
dfState=pop_2019,
dfBurden=dfPer3,
thruLabel="2021-01-20",
plotsTogether=TRUE
)
##
## Recency is defined as 2020-12-23 through current
##
## Recency is defined as 2020-12-23 through current
The plotConsolidatedMetrics() function is copied:
# Function to create plots of consolidated metrics
plotConsolidatedMetrics <- function(dfMain,
                                    dfHosp=NULL,
                                    varMain=c("state", "cluster", "date", "pop", "cases", "deaths", "hosp"),
                                    varDropHosp=c("n", "pop"),
                                    joinBy=c("state", "cluster", "date"),
                                    subT=NULL,
                                    nrowPlot2=1
                                    ) {
  # FUNCTION ARGUMENTS:
  # dfMain: the main file produced by assessClusters(), containing data by state-cluster-date
  # dfHosp: the separate file with hospital data (NULL means data already available in dfMain)
  # varMain: variables to keep from dfMain
  # varDropHosp: variables to drop from dfHosp
  # joinBy: variables for joining dfMain and dfHosp
  # subT: plot subtitle (NULL will use the default)
  # nrowPlot2: number of rows for the facetting to use on plot 2
  # RETURNS: long-format plotting data by state-cluster-date-metric, with per-million
  # values (vpm) and 7-day rolling means (vpm7)
  if (is.null(subT)) {
    subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new)"
  }
  # Filter dfMain to include only variables in varMain
  # select(all_of(...)) replaces the superseded select_at(vars(...))
  dfMain <- dfMain %>%
    select(all_of(varMain))
  # Left join dfMain to dfHosp unless dfHosp is NULL
  if (!is.null(dfHosp)) {
    # any_of() drops the varDropHosp columns, tolerating names that are absent
    # (equivalent to the original setdiff on names(dfHosp))
    dfHosp <- dfHosp %>%
      select(-any_of(varDropHosp))
    # by= takes a plain character vector; all_of() is deprecated outside selections
    dfMain <- dfMain %>%
      left_join(dfHosp, by=joinBy)
  }
  # Check that variables state, cluster, date, pop are all available
  # (!all(...) is clearer than the original pipe, which relied on %in% and %>%
  # sharing precedence to parse as all(c(...) %in% names(dfMain)))
  if (!all(c("state", "cluster", "date", "pop") %in% names(dfMain))) {
    stop("\nFunction requires variables state, cluster, date, and pop after processing\n")
  }
  # Create the relevant plotting data
  # rbind(mutate(., state="cluster")) duplicates each row with state relabeled, so
  # the grouped sums yield both per-state and per-cluster series in one pass
  dfPlot <- dfMain %>%
    pivot_longer(-c(state, cluster, date, pop)) %>%
    filter(!is.na(value)) %>%
    rbind(mutate(., state="cluster")) %>%
    group_by(across(all_of(c(joinBy, "name")))) %>%
    summarize(value=sum(value), pop=sum(pop), .groups="drop") %>%
    mutate(vpm=1000000*value/pop) %>%
    arrange(state, cluster, name, date) %>%
    group_by(state, cluster, name) %>%
    helperRollingAgg(origVar="vpm", newName="vpm7")
  # PLOT 1: facetted plots for totals by metric by segment
  p1 <- dfPlot %>%
    filter(!is.na(vpm7)) %>%
    ggplot(aes(x=date, y=vpm7)) +
    geom_line(data=~filter(., state=="cluster"), aes(group=cluster, color=cluster), lwd=1.5) +
    geom_line(data=~filter(., state!="cluster"), aes(group=state), alpha=0.25) +
    facet_grid(name ~ cluster, scales="free_y") +
    labs(x="",
         y="Rolling 7-day mean per million",
         title="Key metrics by cluster (7-day rolling mean per million)",
         subtitle=subT
         ) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    theme(axis.text.x=element_text(angle=90))
  print(p1)
  # PLOT 2: all-segment plot by metric (cluster aggregates only)
  p2 <- dfPlot %>%
    filter(!is.na(vpm7)) %>%
    ggplot(aes(x=date, y=vpm7)) +
    geom_line(data=~filter(., state=="cluster"), aes(group=cluster, color=cluster), lwd=1.5) +
    facet_wrap(~ name, scales="free_y", nrow=nrowPlot2) +
    labs(x="",
         y="Rolling 7-day mean per million",
         title="Key metrics by cluster (7-day rolling mean per million)",
         subtitle=subT
         ) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    theme(axis.text.x=element_text(angle=90))
  print(p2)
  # PLOT 3: all-metric plot by segment (define 100% as peak for segment-metric)
  p3 <- dfPlot %>%
    filter(!is.na(vpm7)) %>%
    group_by(state, cluster, name) %>%
    mutate(spm7=vpm7/max(vpm7)) %>%
    ggplot(aes(x=date, y=spm7)) +
    geom_line(data=~filter(., state=="cluster"), aes(group=name, color=name), lwd=1) +
    facet_wrap(~ cluster, scales="free_y") +
    labs(x="",
         y="% of Maximum",
         title="Key metrics by cluster (% of cluster maximum for variable)",
         subtitle=subT
         ) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    scale_color_discrete("variable") +
    theme(axis.text.x=element_text(angle=90))
  print(p3)
  # Return the plotting data
  dfPlot
}
# Example for plotConsolidatedMetrics()
# Adds "tests" to the default varMain and supplies a matching subtitle; the
# returned plotting data is kept for the downstream cumulative analyses
subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
consolidatedPlotDataHier3 <- plotConsolidatedMetrics(plotDataHier3,
varMain=c("state", "cluster", "date", "pop",
"cases", "deaths", "hosp", "tests"
),
subT=subT,
nrowPlot2=2
)
The makeCumulative() and plotCumulative() functions are copied:
# Function to convert a file to cumulative totals
makeCumulative <- function(df,
                           typeVar="name",
                           typeKeep=c("cases", "deaths", "tests"),
                           valVar="vpm7",
                           groupVars=c("state", "cluster", "name"),
                           arrangeVars=c("date"),
                           newName="cum7"
                           ) {
  # FUNCTION ARGUMENTS:
  # df: data frame containing the metrics
  # typeVar: the variable holding the metric type (default is 'name')
  # typeKeep: the values of typeVar to be kept
  # valVar: the variable holding the metric value (default is 'vpm7')
  # groupVars: groups for calculating cumulative sum
  # arrangeVars: variables to be sorted by before calculating cumulative sum
  # newName: the name for the new variable
  # RETURNS: filtered, ungrouped data with a cumulative-sum column named newName
  # Create the cumulative data
  # .data[[...]] is the tidy-eval replacement for get(); arrange()/group_by()
  # with across(all_of(...)) replace the superseded arrange_at()/group_by_at()
  dfCum <- df %>%
    filter(.data[[typeVar]] %in% typeKeep, !is.na(.data[[valVar]])) %>%
    arrange(across(all_of(c(groupVars, arrangeVars)))) %>%
    group_by(across(all_of(groupVars))) %>%
    mutate(!!newName := cumsum(.data[[valVar]])) %>%
    ungroup()
  # Return the processed data
  dfCum
}
# Function to plot cumulative data
plotCumulativeData <- function(df,
                               keyMetricp2,
                               flagsp2,
                               p2Desc=keyMetricp2,
                               keyVar="cum7",
                               makep1=FALSE,
                               makep2=TRUE
                               ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame of cumulative data
  # keyMetricp2: the key metric to be plotted in the second plot (e.g., "deaths", "cases", "tests")
  # flagsp2: states to be treated as flagged (bolded and labelled) in the second plot
  # p2Desc: the description to be used in plot 2
  # keyVar: the key variable to plot
  # makep1: boolean, whether to make the first plot
  # makep2: boolean, whether to make the second plot
  # RETURNS: nothing useful; called for the side effect of printing plots
  # Plot the cumulative data by cluster (if flag is set for producing this)
  if (isTRUE(makep1)) {
    p1 <- df %>%
      filter(state=="cluster") %>%
      # .data[[...]] is the documented replacement for get() inside aes()
      ggplot(aes(x=date, y=.data[[keyVar]])) +
      geom_line(aes(group=cluster, color=cluster)) +
      facet_wrap(~name, nrow=1, scales="free_y") +
      scale_x_date(date_breaks="1 months", date_labels="%m") +
      labs(x="Month",
           y="Cumulative Burden (per million)",
           title="Cumulative burden by segment (per million)"
           )
    print(p1)
  }
  # Plot the cumulative totals over time for one metric, and flag the highest
  if (isTRUE(makep2)) {
    p2 <- df %>%
      filter(state!="cluster", name==keyMetricp2) %>%
      mutate(bold=ifelse(state %in% flagsp2, 1, 0)) %>%
      ggplot(aes(x=date, y=.data[[keyVar]])) +
      # alpha/size are mapped to computed values and passed through unscaled by the
      # scale_*_identity() calls below, so flagged states render bolder and thicker
      geom_line(aes(group=state, color=cluster, alpha=0.4+0.6*bold, size=0.5+0.5*bold)) +
      geom_text(data=~filter(., bold==1, date==max(date)),
                aes(x=date+lubridate::days(10),
                    label=paste0(state, ": ", round(.data[[keyVar]], 0)),
                    color=cluster
                    ),
                size=3,
                fontface="bold"
                ) +
      scale_x_date(date_breaks="1 months", date_labels="%m") +
      scale_alpha_identity() +
      scale_size_identity() +
      labs(x="Month",
           y=paste0("Cumulative ", p2Desc, " (per million)"),
           title=paste0("Cumulative coronavirus ", p2Desc, " by state (per million)"),
           subtitle="Top 5 states for total and growth rate are bolded and labelled"
           )
    print(p2)
  }
}
# Function to find and flag states that are high on a key value or change in key value
findFlagStates <- function(df,
                           keyMetricVal,
                           keyMetricVar="name",
                           cumVar="cum7",
                           prevDays=30,
                           nFlag=5
                           ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame containing the cumulative data
  # keyMetricVal: the metric of interest (e.g., "deaths", "cases", "tests")
  # keyMetricVar: the variable name for the column containing the metric of interest
  # cumVar: variable containing the cumulative totals
  # prevDays: the number of days previous to use for defining growth
  # nFlag: the number of states to flag for either total or growth (top-n of each)
  # RETURNS: character vector of flagged states, highest totals first
  # Find top-nFlag in either total or recent increase
  # (.data[[...]] replaces get(); select(all_of(...)) replaces select_at();
  # the redundant ungroup() after .groups="drop" has been removed)
  dfFlag <- df %>%
    filter(.data[[keyMetricVar]]==keyMetricVal, state!="cluster") %>%
    select(all_of(c("state", "date", cumVar))) %>%
    group_by(state) %>%
    summarize(maxVal=max(.data[[cumVar]]),
              # NOTE(review): tminus is the cumulative value exactly prevDays before
              # the state's latest date; if that date is missing the sum is 0 and
              # growth is overstated -- confirm every state has daily coverage
              tminus=sum(ifelse(date==max(date)-lubridate::days(prevDays), .data[[cumVar]], 0)),
              .groups="drop"
              ) %>%
    mutate(growth=maxVal-tminus,
           rkTotal=min_rank(-maxVal),
           rkGrowth=min_rank(-growth),
           flag=ifelse(pmin(rkTotal, rkGrowth)<=nFlag, 1, 0)
           ) %>%
    arrange(-flag, rkTotal)
  # Return the top values as a vector of states
  dfFlag %>%
    filter(flag==1) %>%
    pull(state)
}
# Derive percent-positive (cases/tests) from the consolidated plotting data:
# widen to one column per metric, compute pctpos, then return to long format
# so pctpos flows through the same cumulative/plotting machinery as the metrics
# NOTE(review): -vpm7 is passed positionally as pivot_wider's id_cols; confirm
# this resolves as intended given that "name"/"vpm7" are also names_from/values_from
consPosHier3 <- consolidatedPlotDataHier3 %>%
ungroup() %>%
select(state, cluster, date, name, vpm7) %>%
arrange(state, cluster, date, name) %>%
pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
mutate(pctpos=cases/tests) %>%
pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
filter(!is.na(vpm7))
# Accumulate the default metrics (cases, deaths, tests) into running totals
clCumHier3 <- makeCumulative(consPosHier3)
# Plot 1 only: cumulative burden by cluster (keyMetricp2/flagsp2 are unused when makep2=FALSE)
plotCumulativeData(clCumHier3,
keyMetricp2="",
flagsp2="",
makep1=TRUE,
makep2=FALSE
)
# Plot 2 for each metric, flagging the top states on total or recent growth
plotCumulativeData(clCumHier3,
keyMetricp2="deaths",
flagsp2=findFlagStates(clCumHier3, keyMetricVal = "deaths")
)
plotCumulativeData(clCumHier3,
keyMetricp2="cases",
flagsp2=findFlagStates(clCumHier3, keyMetricVal = "cases")
)
plotCumulativeData(clCumHier3,
keyMetricp2="tests",
flagsp2=findFlagStates(clCumHier3, keyMetricVal = "tests")
)
The functions appear to all be working for 2021 data from COVID Tracking Project. Next steps are to try the integrated function on the latest data.
The integrated function is run using data downloaded on 26-JAN-2021, with a similar approach as previous:
# Create new segments with updated data
# Download only if the CSV is not already present locally, and compare the
# fresh raw data against the raw data saved with the 25-OCT-2020 run to
# surface restatements in the source
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_210126.csv"
ctp_hier7_210126 <- readRunCOVIDTrackingProject(thruLabel="Jan 25, 2021",
downloadTo=if(file.exists(locDownload)) NULL else locDownload,
readFrom=locDownload,
compareFile=readFromRDS("test_hier5_201025")$dfRaw,
hierarchical=TRUE,
# HI is re-assigned to ME's cluster; the remaining arguments are forwarded
# through ... to clusterStates()
reAssignState=list("HI"="ME"),
kCut=8,
minShape=3,
ratioDeathvsCase = 5,
ratioTotalvsShape = 0.25,
minDeath=100,
minCase=10000
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 24935041 411823 17790757
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-01-13 2020-01-14 2020-01-15 2020-01-16 2020-01-17 2020-01-18 2020-01-19 2020-01-20 2020-01-21 2020-10-25 2020-10-26 2020-10-27 2020-10-28 2020-10-29 2020-10-30 2020-10-31 2020-11-01 2020-11-02 2020-11-03 2020-11-04 2020-11-05 2020-11-06 2020-11-07 2020-11-08 2020-11-09 2020-11-10 2020-11-11 2020-11-12 2020-11-13 2020-11-14 2020-11-15 2020-11-16 2020-11-17 2020-11-18 2020-11-19 2020-11-20 2020-11-21 2020-11-22 2020-11-23 2020-11-24 2020-11-25 2020-11-26 2020-11-27 2020-11-28 2020-11-29 2020-11-30 2020-12-01 2020-12-02 2020-12-03 2020-12-04 2020-12-05 2020-12-06 2020-12-07 2020-12-08 2020-12-09 2020-12-10 2020-12-11 2020-12-12 2020-12-13 2020-12-14 2020-12-15 2020-12-16 2020-12-17 2020-12-18 2020-12-19 2020-12-20 2020-12-21 2020-12-22 2020-12-23 2020-12-24 2020-12-25 2020-12-26 2020-12-27 2020-12-28 2020-12-29 2020-12-30 2020-12-31 2021-01-01 2021-01-02 2021-01-03 2021-01-04 2021-01-05 2021-01-06 2021-01-07 2021-01-08 2021-01-09 2021-01-10 2021-01-11 2021-01-12 2021-01-13 2021-01-14 2021-01-15 2021-01-16 2021-01-17 2021-01-18 2021-01-19 2021-01-20 2021-01-21 2021-01-22 2021-01-23 2021-01-24 2021-01-25
##
## Difference of 5+ that is at least 1% (summed to date and metric): 190
## date name newValue oldValue
## 1 2020-02-29 positiveIncrease 3 18
## 2 2020-03-01 positiveIncrease 8 16
## 3 2020-03-02 positiveIncrease 30 44
## 4 2020-03-03 positiveIncrease 42 48
## 5 2020-03-05 positiveIncrease 63 103
## 6 2020-03-06 positiveIncrease 129 109
## 7 2020-03-07 positiveIncrease 135 176
## 8 2020-03-08 positiveIncrease 170 198
## 9 2020-03-09 positiveIncrease 276 292
## 10 2020-03-11 positiveIncrease 420 509
## 11 2020-03-12 positiveIncrease 683 671
## 12 2020-03-13 positiveIncrease 843 1072
## 13 2020-03-14 positiveIncrease 1025 924
## 14 2020-03-16 positiveIncrease 1706 1739
## 15 2020-03-17 positiveIncrease 2088 2588
## 16 2020-03-18 positiveIncrease 3357 3089
## 17 2020-03-21 positiveIncrease 6932 6793
## 18 2020-03-21 hospitalizedCurrently 1492 1436
## 19 2020-03-22 positiveIncrease 9223 9125
## 20 2020-03-23 positiveIncrease 11192 11439
## 21 2020-03-23 hospitalizedCurrently 2812 2770
## 22 2020-03-24 positiveIncrease 10882 10769
## 23 2020-03-25 positiveIncrease 12631 12908
## 24 2020-03-25 deathIncrease 241 235
## 25 2020-03-25 hospitalizedCurrently 5140 5062
## 26 2020-03-26 deathIncrease 313 319
## 27 2020-03-28 deathIncrease 551 544
## 28 2020-03-29 positiveIncrease 19664 19348
## 29 2020-03-29 deathIncrease 505 515
## 30 2020-03-30 positiveIncrease 21202 22042
## 31 2020-03-31 deathIncrease 908 890
## 32 2020-04-01 positiveIncrease 26271 25791
## 33 2020-04-05 positiveIncrease 25910 25500
## 34 2020-04-06 positiveIncrease 28243 29002
## 35 2020-04-07 positiveIncrease 30476 30885
## 36 2020-04-09 positiveIncrease 35116 34503
## 37 2020-04-10 positiveIncrease 33608 34380
## 38 2020-04-10 deathIncrease 2083 2108
## 39 2020-04-11 positiveIncrease 31263 30501
## 40 2020-04-11 deathIncrease 2075 2054
## 41 2020-04-12 positiveIncrease 28115 27784
## 42 2020-04-13 positiveIncrease 24281 25195
## 43 2020-04-15 positiveIncrease 29921 30307
## 44 2020-04-16 positiveIncrease 31527 30978
## 45 2020-04-20 deathIncrease 1816 1798
## 46 2020-04-21 positiveIncrease 26039 26367
## 47 2020-04-23 deathIncrease 1809 1791
## 48 2020-04-24 deathIncrease 1974 1895
## 49 2020-04-25 deathIncrease 1629 1748
## 50 2020-04-27 positiveIncrease 22407 22708
## 51 2020-04-27 deathIncrease 1290 1270
## 52 2020-04-29 deathIncrease 2676 2713
## 53 2020-05-01 deathIncrease 1809 1779
## 54 2020-05-02 deathIncrease 1527 1562
## 55 2020-05-03 deathIncrease 1248 1232
## 56 2020-05-04 positiveIncrease 22195 22649
## 57 2020-05-05 deathIncrease 2490 2452
## 58 2020-05-06 deathIncrease 1918 1948
## 59 2020-05-07 positiveIncrease 27229 27537
## 60 2020-05-11 positiveIncrease 18140 18377
## 61 2020-05-12 positiveIncrease 22442 22890
## 62 2020-05-12 deathIncrease 1509 1486
## 63 2020-05-13 positiveIncrease 21500 21285
## 64 2020-05-13 deathIncrease 1730 1704
## 65 2020-05-14 deathIncrease 1853 1879
## 66 2020-05-15 positiveIncrease 25490 24685
## 67 2020-05-15 deathIncrease 1538 1507
## 68 2020-05-16 positiveIncrease 23743 24702
## 69 2020-05-16 deathIncrease 1237 987
## 70 2020-05-17 positiveIncrease 20436 20009
## 71 2020-05-17 deathIncrease 873 849
## 72 2020-05-18 positiveIncrease 20597 21028
## 73 2020-05-19 positiveIncrease 20687 20897
## 74 2020-05-21 deathIncrease 1380 1394
## 75 2020-05-22 positiveIncrease 24115 24433
## 76 2020-05-22 deathIncrease 1290 1341
## 77 2020-05-23 positiveIncrease 22561 21531
## 78 2020-05-23 deathIncrease 1040 1063
## 79 2020-05-24 positiveIncrease 19062 20072
## 80 2020-05-24 deathIncrease 688 680
## 81 2020-05-26 deathIncrease 665 645
## 82 2020-05-27 positiveIncrease 19172 19447
## 83 2020-05-27 deathIncrease 1335 1321
## 84 2020-06-01 positiveIncrease 20101 20485
## 85 2020-06-01 deathIncrease 679 668
## 86 2020-06-02 positiveIncrease 19879 20109
## 87 2020-06-03 positiveIncrease 20182 20390
## 88 2020-06-03 deathIncrease 975 993
## 89 2020-06-04 positiveIncrease 20477 20886
## 90 2020-06-04 deathIncrease 883 893
## 91 2020-06-05 positiveIncrease 23050 23394
## 92 2020-06-05 deathIncrease 835 826
## 93 2020-06-06 positiveIncrease 22746 23064
## 94 2020-06-06 deathIncrease 714 728
## 95 2020-06-07 positiveIncrease 19056 18740
## 96 2020-06-08 positiveIncrease 16923 17209
## 97 2020-06-08 deathIncrease 675 661
## 98 2020-06-09 positiveIncrease 16916 17312
## 99 2020-06-09 deathIncrease 886 902
## 100 2020-06-12 positiveIncrease 23141 23597
## 101 2020-06-12 deathIncrease 766 775
## 102 2020-06-14 positiveIncrease 21658 21399
## 103 2020-06-15 positiveIncrease 18255 18649
## 104 2020-06-16 positiveIncrease 22838 23478
## 105 2020-06-16 deathIncrease 720 730
## 106 2020-06-17 deathIncrease 780 767
## 107 2020-06-18 positiveIncrease 27042 27746
## 108 2020-06-18 deathIncrease 682 705
## 109 2020-06-19 positiveIncrease 30865 31471
## 110 2020-06-20 deathIncrease 615 629
## 111 2020-06-21 positiveIncrease 29188 27928
## 112 2020-06-22 positiveIncrease 26829 27281
## 113 2020-06-23 deathIncrease 724 710
## 114 2020-06-24 deathIncrease 704 724
## 115 2020-06-26 deathIncrease 618 637
## 116 2020-06-29 positiveIncrease 39398 39813
## 117 2020-06-30 positiveIncrease 47010 47864
## 118 2020-06-30 deathIncrease 579 596
## 119 2020-07-02 positiveIncrease 53511 54085
## 120 2020-07-04 deathIncrease 296 306
## 121 2020-07-06 positiveIncrease 40925 41959
## 122 2020-07-06 deathIncrease 237 243
## 123 2020-07-07 positiveIncrease 50990 51687
## 124 2020-07-07 deathIncrease 905 923
## 125 2020-07-08 deathIncrease 819 807
## 126 2020-07-10 deathIncrease 839 854
## 127 2020-07-13 positiveIncrease 57160 58133
## 128 2020-07-14 positiveIncrease 58609 62687
## 129 2020-07-15 positiveIncrease 69373 65797
## 130 2020-07-17 deathIncrease 939 951
## 131 2020-07-20 deathIncrease 376 363
## 132 2020-07-21 positiveIncrease 62920 63930
## 133 2020-07-22 deathIncrease 1149 1171
## 134 2020-07-23 deathIncrease 1070 1056
## 135 2020-07-27 positiveIncrease 54479 55332
## 136 2020-07-31 deathIncrease 1328 1312
## 137 2020-08-02 positiveIncrease 53301 46812
## 138 2020-08-03 positiveIncrease 42738 49713
## 139 2020-08-04 positiveIncrease 51198 51866
## 140 2020-08-04 deathIncrease 1241 1255
## 141 2020-08-10 positiveIncrease 41370 42089
## 142 2020-08-11 positiveIncrease 54935 55701
## 143 2020-08-14 positiveIncrease 57101 55636
## 144 2020-08-18 positiveIncrease 40070 40795
## 145 2020-08-18 deathIncrease 1179 1196
## 146 2020-08-25 positiveIncrease 36839 36379
## 147 2020-08-29 positiveIncrease 43995 44501
## 148 2020-08-30 positiveIncrease 38766 39501
## 149 2020-08-31 deathIncrease 380 366
## 150 2020-09-01 deathIncrease 1014 1027
## 151 2020-09-07 positiveIncrease 28117 28682
## 152 2020-09-08 deathIncrease 347 358
## 153 2020-09-09 positiveIncrease 30733 31114
## 154 2020-09-15 positiveIncrease 34778 35445
## 155 2020-09-17 deathIncrease 880 863
## 156 2020-09-18 positiveIncrease 46889 47486
## 157 2020-09-20 positiveIncrease 35533 36295
## 158 2020-09-21 deathIncrease 281 287
## 159 2020-09-23 positiveIncrease 39498 38567
## 160 2020-09-24 deathIncrease 938 921
## 161 2020-09-26 positiveIncrease 47268 47856
## 162 2020-09-27 positiveIncrease 34990 35454
## 163 2020-09-28 positiveIncrease 35376 36524
## 164 2020-09-28 deathIncrease 246 257
## 165 2020-09-29 deathIncrease 724 739
## 166 2020-09-30 positiveIncrease 44909 44424
## 167 2020-10-01 deathIncrease 862 851
## 168 2020-10-04 deathIncrease 380 363
## 169 2020-10-05 positiveIncrease 37752 38133
## 170 2020-10-05 deathIncrease 331 326
## 171 2020-10-06 deathIncrease 613 634
## 172 2020-10-07 positiveIncrease 51216 50602
## 173 2020-10-07 deathIncrease 929 916
## 174 2020-10-09 deathIncrease 913 893
## 175 2020-10-10 deathIncrease 691 665
## 176 2020-10-11 deathIncrease 471 466
## 177 2020-10-13 positiveIncrease 46979 48387
## 178 2020-10-13 deathIncrease 718 690
## 179 2020-10-14 deathIncrease 801 811
## 180 2020-10-15 deathIncrease 928 951
## 181 2020-10-16 deathIncrease 891 877
## 182 2020-10-18 positiveIncrease 47957 48922
## 183 2020-10-18 deathIncrease 405 393
## 184 2020-10-19 deathIncrease 443 456
## 185 2020-10-21 positiveIncrease 61710 58606
## 186 2020-10-22 positiveIncrease 73419 75248
## 187 2020-10-22 deathIncrease 1117 1143
## 188 2020-10-23 deathIncrease 949 916
## 189 2020-10-24 positiveIncrease 83792 82668
## 190 2020-10-24 deathIncrease 896 885
## Warning: Removed 102 row(s) containing missing values (geom_path).
##
##
## Difference of 5+ that is at least 1% (summed to state and metric): 12
## state name newValue oldValue
## 1 AK positiveIncrease 12523 13535
## 2 CO positiveIncrease 93398 91570
## 3 CO deathIncrease 2218 2076
## 4 FL positiveIncrease 766305 776249
## 5 ND deathIncrease 453 345
## 6 NJ positiveIncrease 236393 227339
## 7 NM positiveIncrease 41040 40168
## 8 NM hospitalizedCurrently 27399 27120
## 9 PR positiveIncrease 31067 61275
## 10 RI positiveIncrease 30581 30116
## 11 WA positiveIncrease 105278 101345
## 12 WA hospitalizedCurrently 92643 69716
## Rows: 18,474
## Columns: 55
## $ date <date> 2021-01-25, 2021-01-25, 2021-01-25, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 51693, 443009, 284702, 0, 727895, 31361...
## $ probableCases <dbl> NA, 92021, 56292, NA, 44579, NA, 18539,...
## $ negative <dbl> 1405360, 1740023, 2128124, 2140, 268287...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource <chr> "totalTestsViral", "totalTestsPeopleVir...
## $ totalTestResults <dbl> 1457053, 2091011, 2356534, 2140, 642979...
## $ hospitalizedCurrently <dbl> 54, 2285, 1084, NA, 4229, 18347, 737, 1...
## $ hospitalizedCumulative <dbl> 1185, 41069, 13312, NA, 50483, NA, 2126...
## $ inIcuCurrently <dbl> NA, NA, 336, NA, 1027, 4475, NA, NA, 67...
## $ inIcuCumulative <dbl> NA, 2511, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 7, NA, 187, NA, 702, NA, NA, NA, 39, NA...
## $ onVentilatorCumulative <dbl> NA, 1433, 1395, NA, NA, NA, NA, NA, NA,...
## $ recovered <dbl> NA, 233211, 262229, NA, 98779, NA, 2056...
## $ dataQualityGrade <chr> "B", "A", "A+", "N/A", "A+", "B", "A", ...
## $ lastUpdateEt <chr> "1/25/2021 03:59", "1/25/2021 11:00", "...
## $ dateModified <dttm> 2021-01-25 03:59:00, 2021-01-25 11:00:...
## $ checkTimeEt <chr> "01/24 22:59", "01/25 06:00", "01/24 19...
## $ death <dbl> 259, 6662, 4650, 0, 12239, 37118, 5512,...
## $ hospitalized <dbl> 1185, 41069, 13312, NA, 50483, NA, 2126...
## $ dateChecked <dttm> 2021-01-25 03:59:00, 2021-01-25 11:00:...
## $ totalTestsViral <dbl> 1457053, NA, 2356534, 2140, 6429799, 40...
## $ positiveTestsViral <dbl> 62017, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral <dbl> 1393392, NA, 2128124, NA, NA, NA, NA, N...
## $ positiveCasesViral <dbl> NA, 350988, 228410, 0, 683316, 3136158,...
## $ deathConfirmed <dbl> NA, 5469, 3778, NA, 10942, NA, 4801, 56...
## $ deathProbable <dbl> NA, 1193, 872, NA, 1297, NA, 704, 1290,...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 5263473, NA, 10...
## $ totalTestsPeopleViral <dbl> NA, 2091011, NA, NA, 3366193, NA, 23561...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 413052, NA, 334821, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 42087, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 290389, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, 101423, NA, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, 339424, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, 66176, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen <dbl> NA, NA, NA, NA, NA, NA, NA, 216653, NA,...
## $ positiveTestsAntigen <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ fips <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease <dbl> 83, 1839, 636, 0, 5321, 27007, 1177, 58...
## $ negativeIncrease <dbl> 3500, 2929, 8564, 0, 11106, 376186, 0, ...
## $ total <dbl> 1457053, 2183032, 2412826, 2140, 341077...
## $ totalTestResultsIncrease <dbl> 3583, 4456, 9146, 0, 44580, 403193, 165...
## $ posNeg <dbl> 1457053, 2183032, 2412826, 2140, 341077...
## $ deathIncrease <dbl> 0, 2, 44, 0, 1, 328, 7, 92, 7, 8, 156, ...
## $ hospitalizedIncrease <dbl> 0, 555, 45, 0, 219, 0, 26, 0, 0, 0, 168...
## $ hash <chr> "5441047eb1ea7dc0969257ef6dbee3b153f09c...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 101614 1933 NA 561852 1580
## 2 TRUE 24833427 409890 NA 296195058 16894
## Rows: 16,894
## Columns: 6
## $ date <date> 2021-01-25, 2021-01-25, 2021-01-25, 2021-01-25, 2021-01-25,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 83, 1839, 636, 5321, 27007, 1177, 5817, 204, 616, 8542, 3530...
## $ deaths <dbl> 0, 2, 44, 1, 328, 7, 92, 7, 8, 156, 53, 0, 0, 1, 64, 12, 24,...
## $ hosp <dbl> 54, 2285, 1084, 4229, 18347, 737, 1068, 248, 385, 6899, 5231...
## $ tests <dbl> 3583, 4456, 9146, 44580, 403193, 16562, 89370, 5786, 8116, 8...
## Rows: 16,894
## Columns: 14
## $ date <date> 2020-01-13, 2020-01-14, 2020-01-15, 2020-01-16, 2020-01-17,...
## $ state <chr> "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "WA", "MA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ cpm <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, 0.01876023, 0.01876023, 0.03752046, 0.03752046, ...
## $ dpm7 <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, NA, 0, NA, 0, 0, 0, 0, ...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, 0.00000000, 0.00000000, 0.00000000, 0.00000000, ...
##
## Recency is defined as 2020-12-27 through current
##
## Recency is defined as 2020-12-27 through current
# Persist the result list to RDS; presumably ovrWriteError=FALSE permits
# overwriting an existing file without erroring -- confirm in saveToRDS()
saveToRDS(ctp_hier7_210126, ovrWriteError=FALSE)
The main function is modified slightly to allow for taking advantage of the updated functions:
# Function to download/load, process, segment, and analyze data from COVID Tracking Project
readRunCOVIDTrackingProject <- function(thruLabel,
                                        downloadTo=NULL,
                                        readFrom=downloadTo,
                                        compareFile=NULL,
                                        dateChangePlot=FALSE,
                                        dateMetricPrint=TRUE,
                                        writeLog=NULL,
                                        ovrwriteLog=TRUE,
                                        dfPerCapita=NULL,
                                        useClusters=NULL,
                                        hierarchical=TRUE,
                                        returnList=!hierarchical,
                                        kCut=6,
                                        reAssignState=vector("list", 0),
                                        makeCumulativePlots=TRUE,
                                        skipAssessmentPlots=FALSE,
                                        ...
                                        ) {
  # FUNCTION ARGUMENTS:
  # thruLabel: the label for when the data are through (e.g., "Aug 30, 2020")
  # downloadTo: download the most recent COVID Tracking Project data to this location
  #   NULL means do not download any data
  # readFrom: location for reading in the COVID Tracking Project data (defaults to downloadTo)
  # compareFile: name of the file to use for comparisons when reading in raw data (NULL means no comparison)
  # dateChangePlot: boolean, should changes in dates be captured as a plot rather than as a list?
  # dateMetricPrint: boolean, should the changes by date and metric be printed to the main log?
  # writeLog: name of a separate log file for capturing detailed data on changes between files
  #   NULL means no detailed data captured
  # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
  # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
  # useClusters: file containing clusters by state (NULL means make the clusters from the data)
  # hierarchical: boolean, should hierarchical clusters be produced (if FALSE, will be k-means)?
  # returnList: boolean, should a list be returned or just the cluster object?
  #   refers to what is returned by clusterStates(); the main function always returns a list
  # kCut: number of segments when cutting the hierarchical tree
  # reAssignState: mapping file for assigning a state to another state's cluster
  #   format list("stateToChange"="stateClusterToAssign")
  # makeCumulativePlots: whether to make plots of cumulative metrics
  # skipAssessmentPlots: boolean to skip the plots for assessClusters()
  #   especially useful if just exploring dendrograms or silhouette widths
  # ...: arguments to be passed to clusterStates(), will be used only if useClusters is NULL
  # STEP 1: Get state data
  stateData <- getStateData()
  # Helper function for glimpsing
  glimpseFile <- function(x, txt) {
    cat(txt)
    glimpse(x)
  }
  # Helper to route a glimpse either to the console (no log requested) or,
  # when writeLog is set, append it to the detailed log file instead
  logGlimpse <- function(x, txt) {
    if (is.null(writeLog)) glimpseFile(x, txt=txt)
    else capture.output(glimpseFile(x, txt=txt), file=writeLog, append=TRUE)
  }
  # STEPS 2-4 are run only if dfPerCapita does not exist
  if (is.null(dfPerCapita)) {
    # STEP 2a: Download latest COVID Tracking Project data (if requested)
    if (!is.null(downloadTo)) downloadCOVIDbyState(fileName=downloadTo)
    # STEP 2b: Read-in COVID Tracking Project data
    dfRaw <- readCOViDbyState(readFrom,
                              checkFile=compareFile,
                              dateChangePlot=dateChangePlot,
                              dateMetricPrint=dateMetricPrint,
                              writeLog=writeLog,
                              ovrwriteLog=ovrwriteLog
                              )
    logGlimpse(dfRaw, txt="\nRaw data file:\n")
    # STEP 3: Process the data so that it includes all requested key variables
    varsFilter <- c("date", "state", "positiveIncrease", "deathIncrease",
                    "hospitalizedCurrently", "totalTestResultsIncrease"
                    )
    dfFiltered <- processCVData(dfRaw,
                                varsKeep=varsFilter,
                                varsRename=c(positiveIncrease="cases",
                                             deathIncrease="deaths",
                                             hospitalizedCurrently="hosp",
                                             totalTestResultsIncrease="tests"
                                             )
                                )
    logGlimpse(dfFiltered, txt="\nFiltered data file:\n")
    # STEP 4: Convert to per capita
    dfPerCapita <- helperMakePerCapita(dfFiltered,
                                       mapVars=c("cases"="cpm", "deaths"="dpm",
                                                 "hosp"="hpm", "tests"="tpm"
                                                 ),
                                       popData=stateData
                                       )
    logGlimpse(dfPerCapita, txt="\nPer capita data file:\n")
  } else {
    # dfPerCapita was supplied directly; no raw/filtered intermediates exist
    dfRaw <- NULL
    dfFiltered <- NULL
  }
  # STEP 5: Create the clusters (if they have not been passed)
  if (is.null(useClusters)) {
    # Run the clustering process
    clData <- clusterStates(df=dfPerCapita, hierarchical=hierarchical, returnList=returnList, ...)
    # If hierarchical clusters, cut the tree, otherwise use the output object directly
    if (hierarchical) {
      useClusters <- cutree(clData, k=kCut)
    } else {
      useClusters <- clData$objCluster$cluster
    }
    # If requested, manually assign clusters to the cluster for another state
    for (xNum in seq_along(reAssignState)) {
      useClusters[names(reAssignState)[xNum]] <- useClusters[reAssignState[[xNum]]]
    }
  }
  # STEP 5a: Stop the process and return what is available if skipAssessmentPlots is TRUE
  if (skipAssessmentPlots) {
    return(list(stateData=stateData,
                dfRaw=dfRaw,
                dfFiltered=dfFiltered,
                dfPerCapita=dfPerCapita,
                useClusters=useClusters,
                plotData=NULL,
                consolidatedPlotData=NULL,
                clCum=NULL
                )
           )
  }
  # STEP 6: Create the cluster assessments
  plotData <- assessClusters(useClusters,
                             dfState=stateData,
                             dfBurden=dfPerCapita,
                             thruLabel=thruLabel,
                             plotsTogether=TRUE
                             )
  # STEP 7: Plot the consolidated metrics
  subT <- "Cases: new cases, Deaths: new deaths, Hosp: total in hospital (not new), Tests: new tests"
  consolidatedPlotData <- plotConsolidatedMetrics(plotData,
                                                  varMain=c("state", "cluster", "date", "pop",
                                                            "cases", "deaths", "hosp", "tests"
                                                            ),
                                                  subT=subT,
                                                  nrowPlot2=2
                                                  )
  # STEP 8: Create cumulative metrics if requested
  if (makeCumulativePlots) {
    # Widen to one column per metric so percent-positive (cases/tests) can be
    # computed, then return to long format and drop missing values
    # NOTE(review): the positional -vpm7 fills pivot_wider()'s id_cols argument;
    # confirm this is intended rather than relying on the id_cols default
    consPos <- consolidatedPlotData %>%
      ungroup() %>%
      select(state, cluster, date, name, vpm7) %>%
      arrange(state, cluster, date, name) %>%
      pivot_wider(-vpm7, names_from="name", values_from="vpm7") %>%
      mutate(pctpos=cases/tests) %>%
      pivot_longer(-c(state, cluster, date), values_to="vpm7") %>%
      filter(!is.na(vpm7))
    clCum <- makeCumulative(consPos)
    plotCumulativeData(clCum,
                       keyMetricp2="",
                       flagsp2="",
                       makep1=TRUE,
                       makep2=FALSE
                       )
    plotCumulativeData(clCum,
                       keyMetricp2="deaths",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "deaths")
                       )
    plotCumulativeData(clCum,
                       keyMetricp2="cases",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "cases")
                       )
    plotCumulativeData(clCum,
                       keyMetricp2="tests",
                       flagsp2=findFlagStates(clCum, keyMetricVal = "tests")
                       )
  } else {
    clCum <- NULL
  }
  # STEP 9: Return a list of the key data
  list(stateData=stateData,
       dfRaw=dfRaw,
       dfFiltered=dfFiltered,
       dfPerCapita=dfPerCapita,
       useClusters=useClusters,
       plotData=plotData,
       consolidatedPlotData=consolidatedPlotData,
       clCum=clCum
       )
}
The updated function is then run:
# Custom shape function: buckets a date into a "YYYY-MM" label so that
# 2020 and 2021 months sort and display correctly
customTimeBucket <- function(x) {
  yr <- lubridate::year(x)
  mo <- stringr::str_pad(lubridate::month(x), width=2, side="left", pad="0")
  paste0(yr, "-", mo)
}
# Create new segments with updated data
# File locations for the downloaded CSV and the detailed change log
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_210127.csv"
locLog <- "./RInputFiles/Coronavirus/downloaded_20210127.log"
# Run the full pipeline: download (only if the CSV is not already on disk),
# read with comparison against the prior run's raw data, cluster hierarchically
# with 8 segments, and manually reassign HI and VT to ME's cluster.
# Arguments from shapeFunc onward are passed through ... to clusterStates().
ctp_hier6_210127 <- readRunCOVIDTrackingProject(thruLabel="Jan 26, 2021",
downloadTo=if(file.exists(locDownload)) NULL else locDownload,
readFrom=locDownload,
compareFile=readFromRDS("test_hier5_201025")$dfRaw,
dateChangePlot=TRUE,
dateMetricPrint=FALSE,
writeLog=locLog,
hierarchical=TRUE,
reAssignState=list("HI"="ME", "VT"="ME"),
kCut=8,
shapeFunc=customTimeBucket,
minShape="2020-04",
maxShape="2021-01",
ratioDeathvsCase = 5,
ratioTotalvsShape = 0.25,
minDeath=100,
minCase=10000
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 25078786 415557 17899714
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: dates
## In reference but not in current: 0
## In current but not in reference: 103
## Detailed differences available in: ./RInputFiles/Coronavirus/downloaded_20210127.log
##
##
## Difference of 5+ that is at least 1% (summed to date and metric): 190
## Detailed output available in log: ./RInputFiles/Coronavirus/downloaded_20210127.log
## Warning: Removed 103 row(s) containing missing values (geom_path).
##
##
## Difference of 5+ that is at least 1% (summed to state and metric): 12
## state name newValue oldValue
## 1 AK positiveIncrease 12523 13535
## 2 CO positiveIncrease 93398 91570
## 3 CO deathIncrease 2218 2076
## 4 FL positiveIncrease 766305 776249
## 5 ND deathIncrease 453 345
## 6 NJ positiveIncrease 236393 227339
## 7 NM positiveIncrease 41040 40168
## 8 NM hospitalizedCurrently 27399 27120
## 9 PR positiveIncrease 31067 61275
## 10 RI positiveIncrease 30581 30116
## 11 WA positiveIncrease 105278 101345
## 12 WA hospitalizedCurrently 92643 69716
##
##
## Control totals - note that validState other than TRUE will be discarded
## (na.rm=TRUE)
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 102108 1938 104714 563049 1585
## 2 TRUE 24976678 413619 17795000 297888469 16945
##
## Recency is defined as 2020-12-28 through current
##
## Recency is defined as 2020-12-28 through current
# Persist the run's results; ovrWriteError=FALSE allows overwriting an existing file of the same name -- TODO confirm against saveToRDS() definition
saveToRDS(ctp_hier6_210127, ovrWriteError=FALSE)
The updated functionality appears to be working for 2021.
The assessClusters() function can be used to create different clusters, such as for US census regions:
# Build an alternative "clustering": map each state (plus DC, assigned to
# South) to its US census region using the base R state.region/state.abb data
stateCensus <- setNames(c(as.character(state.region), "South"), c(state.abb, "DC"))
# Assess the census-region segments with the state and per-capita data from
# the hierarchical run, suppressing the two scatterplot families
ctp_census_hier6_210127 <- assessClusters(stateCensus,
                                          dfState=ctp_hier6_210127$stateData,
                                          dfBurden=ctp_hier6_210127$dfPerCapita,
                                          thruLabel="Jan 26, 2021",
                                          plotsTogether=TRUE,
                                          makeTotalvsPerCapitaPlots=FALSE,
                                          makeRecentvsTotalPlots=FALSE
                                          )
# Print the resulting assessment tibble
ctp_census_hier6_210127
## # A tibble: 16,945 x 17
## state cluster name pop date cases deaths hosp tests cpm dpm
## <chr> <fct> <chr> <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AL South Alab~ 4.90e6 2020-03-07 0 0 NA 0 0 0
## 2 AL South Alab~ 4.90e6 2020-03-08 0 0 NA 0 0 0
## 3 AL South Alab~ 4.90e6 2020-03-09 0 0 NA 0 0 0
## 4 AL South Alab~ 4.90e6 2020-03-10 0 0 NA 0 0 0
## 5 AL South Alab~ 4.90e6 2020-03-11 0 0 NA 10 0 0
## 6 AL South Alab~ 4.90e6 2020-03-12 0 0 NA 0 0 0
## 7 AL South Alab~ 4.90e6 2020-03-13 1 0 NA 2 0.204 0
## 8 AL South Alab~ 4.90e6 2020-03-14 5 0 NA 16 1.02 0
## 9 AL South Alab~ 4.90e6 2020-03-15 6 0 NA 12 1.22 0
## 10 AL South Alab~ 4.90e6 2020-03-16 16 0 NA 16 3.26 0
## # ... with 16,935 more rows, and 6 more variables: hpm <dbl>, tpm <dbl>,
## # cpm7 <dbl>, dpm7 <dbl>, hpm7 <dbl>, tpm7 <dbl>